diff --git a/.circleci/config.yml b/.circleci/config.yml
index 70da5dd90f2a89..b87a0aeabb8c5d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -134,11 +134,10 @@ jobs:
             command: pip freeze | tee installed.txt
         - store_artifacts:
             path: ~/transformers/installed.txt
-        - run: black --check --preview examples tests src utils
-        - run: isort --check-only examples tests src utils
+        - run: black --check examples tests src utils
+        - run: ruff examples tests src utils
         - run: python utils/custom_init_isort.py --check_only
         - run: python utils/sort_auto_mappings.py --check_only
-        - run: flake8 examples tests src utils
         - run: doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
         - run: python utils/check_doc_toc.py
diff --git a/Makefile b/Makefile
index 2febcfe85ebdf1..0ca30634790955 100644
--- a/Makefile
+++ b/Makefile
@@ -9,9 +9,8 @@ modified_only_fixup:
 	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
 	@if test -n "$(modified_py_files)"; then \
 		echo "Checking/fixing $(modified_py_files)"; \
-		black --preview $(modified_py_files); \
-		isort $(modified_py_files); \
-		flake8 $(modified_py_files); \
+		black $(modified_py_files); \
+		ruff $(modified_py_files) --fix; \
 	else \
 		echo "No library .py files were modified"; \
 	fi
@@ -48,11 +47,10 @@ repo-consistency:
 # this target runs checks on all files
 quality:
-	black --check --preview $(check_dirs)
-	isort --check-only $(check_dirs)
+	black --check $(check_dirs)
 	python utils/custom_init_isort.py --check_only
 	python utils/sort_auto_mappings.py --check_only
-	flake8 $(check_dirs)
+	ruff $(check_dirs)
 	doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
 	python utils/check_doc_toc.py
@@ -67,8 +65,8 @@ extra_style_checks:
 # this target runs checks on all files and potentially modifies some of them
 style:
-	black --preview $(check_dirs)
-	isort $(check_dirs)
+	black $(check_dirs)
+	ruff $(check_dirs) --fix
 	${MAKE} autogenerate_code
 	${MAKE} extra_style_checks
diff --git a/docs/source/en/pipeline_webserver.mdx b/docs/source/en/pipeline_webserver.mdx
index d9f12fa2b3a0d6..f62985ec26b5bb 100644
--- a/docs/source/en/pipeline_webserver.mdx
+++ b/docs/source/en/pipeline_webserver.mdx
@@ -96,7 +96,7 @@ while True:
             queues.append(rq)
         strings
         outs = pipe(strings, batch_size=len(strings))
-        for (rq, out) in zip(queues, outs):
+        for rq, out in zip(queues, outs):
             await rq.put(out)
 ```
diff --git a/docs/source/en/tasks/asr.mdx b/docs/source/en/tasks/asr.mdx
index 2b66c96eccadd6..4c31cd2841895f 100644
--- a/docs/source/en/tasks/asr.mdx
+++ b/docs/source/en/tasks/asr.mdx
@@ -166,7 +166,6 @@ Unlike other data collators, this specific data collator needs to apply a differ
 >>> @dataclass
 ... class DataCollatorCTCWithPadding:
-
 ...     processor: AutoProcessor
 ...     padding: Union[bool, str] = "longest"
diff --git a/docs/source/en/tasks/object_detection.mdx b/docs/source/en/tasks/object_detection.mdx
index 7cac02d3ef304d..411ed7d2e7393f 100644
--- a/docs/source/en/tasks/object_detection.mdx
+++ b/docs/source/en/tasks/object_detection.mdx
@@ -213,7 +213,6 @@ The `image_processor` expects the annotations to be in the following format: `{'
 ```py
 >>> def formatted_anns(image_id, category, area, bbox):
-
 ...     annotations = []
 ...     for i in range(0, len(category)):
 ...         new_ann = {
@@ -399,6 +398,7 @@ First, prepare the `cppe5["test"]` set: format the annotations and save the data
 ```py
 >>> import json
+
 >>> # format annotations the same as for training, no need for data augmentation
 >>> def val_formatted_anns(image_id, objects):
 ...     annotations = []
diff --git a/docs/source/es/tasks/asr.mdx b/docs/source/es/tasks/asr.mdx
index 7d331b11f7eaee..f3747a332d7f42 100644
--- a/docs/source/es/tasks/asr.mdx
+++ b/docs/source/es/tasks/asr.mdx
@@ -159,7 +159,6 @@ A diferencia de otros collators de datos, este tiene que aplicarle un método de
 >>> @dataclass
 ... class DataCollatorCTCWithPadding:
-
 ...     processor: AutoProcessor
 ...     padding: Union[bool, str] = "longest"
diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py
index a22437f504e5cb..66bd7290758108 100644
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -29,23 +29,23 @@ from typing import Callable, Optional
 import datasets
-import nltk  # Here to have a nice missing dependency error message early on
-import numpy as np
-from datasets import Dataset, load_dataset
-from PIL import Image
-from tqdm import tqdm
-
 import evaluate
 import jax
 import jax.numpy as jnp
+import nltk  # Here to have a nice missing dependency error message early on
+import numpy as np
 import optax
-import transformers
+from datasets import Dataset, load_dataset
 from filelock import FileLock
 from flax import jax_utils, traverse_util
 from flax.jax_utils import unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
+from PIL import Image
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     AutoImageProcessor,
     AutoTokenizer,
diff --git a/examples/flax/language-modeling/run_bart_dlm_flax.py b/examples/flax/language-modeling/run_bart_dlm_flax.py
index 2b8d07539e9b5a..0a97bffd9304b0 100644
--- a/examples/flax/language-modeling/run_bart_dlm_flax.py
+++ b/examples/flax/language-modeling/run_bart_dlm_flax.py
@@ -32,20 +32,20 @@ from pathlib import Path
 from typing import Dict, List, Optional
-import nltk
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-
 import flax
 import jax
 import jax.numpy as jnp
+import nltk
+import numpy as np
 import optax
+from datasets import load_dataset
 from flax import jax_utils, traverse_util
 from flax.jax_utils import pad_shard_unpad
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_MASKED_LM_MAPPING,
diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py
index 0d516878bcb684..607c9bb1ee7c88 100755
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -34,19 +34,19 @@ from typing import Callable, Optional
 import datasets
-import numpy as np
-from datasets import Dataset, load_dataset
-from tqdm import tqdm
-
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
-import transformers
+from datasets import Dataset, load_dataset
 from flax import jax_utils, traverse_util
 from flax.jax_utils import pad_shard_unpad, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py
index a2b45b12a224bf..6a06533b14e419 100755
--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -34,19 +34,19 @@ from pathlib import Path
 from typing import Dict, List, Optional, Tuple
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-
 import flax
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
+from datasets import load_dataset
 from flax import jax_utils, traverse_util
 from flax.jax_utils import pad_shard_unpad
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_MASKED_LM_MAPPING,
diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py
index 9fb7bdce0dcd9a..814d68a88e3716 100755
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -33,19 +33,19 @@ from pathlib import Path
 from typing import Dict, List, Optional
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-
 import flax
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
+from datasets import load_dataset
 from flax import jax_utils, traverse_util
 from flax.jax_utils import pad_shard_unpad
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_MASKED_LM_MAPPING,
diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py
index eb86b98a27ab38..628b9b81b286c0 100644
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@@ -31,20 +31,21 @@ from typing import Any, Callable, Dict, Optional, Tuple
 import datasets
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-
 import evaluate
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
-import transformers
+from datasets import load_dataset
 from flax import struct, traverse_util
 from flax.jax_utils import pad_shard_unpad, replicate, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+from utils_qa import postprocess_qa_predictions
+
+import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
@@ -55,7 +56,6 @@
     is_tensorboard_available,
 )
 from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
-from utils_qa import postprocess_qa_predictions
 logger = logging.getLogger(__name__)
@@ -301,6 +301,7 @@ def __post_init__(self):
 # endregion
+
 # region Create a train state
 def create_train_state(
     model: FlaxAutoModelForQuestionAnswering,
@@ -387,6 +388,7 @@ def create_learning_rate_fn(
 # endregion
+
 # region train data iterator
 def train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
     """Returns shuffled batches of size `batch_size` from truncated `train dataset`, sharded over all local devices."""
@@ -405,6 +407,7 @@ def train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
 # endregion
+
 # region eval data iterator
 def eval_data_collator(dataset: Dataset, batch_size: int):
     """Returns batches of size `batch_size` from `eval dataset`. Sharding handled by `pad_shard_unpad` in the eval loop."""
@@ -934,7 +937,6 @@ def eval_step(state, batch):
     total_steps = step_per_epoch * num_epochs
     epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
     for epoch in epochs:
-
         train_start = time.time()
         train_metrics = []
@@ -975,7 +977,6 @@ def eval_step(state, batch):
             and (cur_step % training_args.eval_steps == 0 or cur_step % step_per_epoch == 0)
             and cur_step > 0
         ):
-
             eval_metrics = {}
             all_start_logits = []
             all_end_logits = []
diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py
index 361746e2826e72..feda69592070f0 100644
--- a/examples/flax/summarization/run_summarization_flax.py
+++ b/examples/flax/summarization/run_summarization_flax.py
@@ -31,22 +31,22 @@ from typing import Callable, Optional
 import datasets
-import nltk  # Here to have a nice missing dependency error message early on
-import numpy as np
-from datasets import Dataset, load_dataset
-from tqdm import tqdm
-
 import evaluate
 import jax
 import jax.numpy as jnp
+import nltk  # Here to have a nice missing dependency error message early on
+import numpy as np
 import optax
-import transformers
+from datasets import Dataset, load_dataset
 from filelock import FileLock
 from flax import jax_utils, traverse_util
 from flax.jax_utils import pad_shard_unpad, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py
index 6032408c685117..c47ea90d392a3d 100755
--- a/examples/flax/text-classification/run_flax_glue.py
+++ b/examples/flax/text-classification/run_flax_glue.py
@@ -26,20 +26,20 @@ from typing import Any, Callable, Dict, Optional, Tuple
 import datasets
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-
 import evaluate
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
-import transformers
+from datasets import load_dataset
 from flax import struct, traverse_util
 from flax.jax_utils import pad_shard_unpad, replicate, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
@@ -586,7 +586,6 @@ def eval_step(state, batch):
     total_steps = steps_per_epoch * num_epochs
     epochs = tqdm(range(num_epochs), desc=f"Epoch ... (0/{num_epochs})", position=0)
     for epoch in epochs:
-
         train_start = time.time()
         train_metrics = []
@@ -623,7 +622,6 @@ def eval_step(state, batch):
             train_metrics = []
         if (cur_step % training_args.eval_steps == 0 or cur_step % steps_per_epoch == 0) and cur_step > 0:
-
             # evaluate
             eval_loader = glue_eval_data_collator(eval_dataset, eval_batch_size)
             for batch in tqdm(
diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py
index a20aa4cfab5cc7..c7509433d95796 100644
--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@@ -28,20 +28,20 @@ from typing import Any, Callable, Dict, Optional, Tuple
 import datasets
-import numpy as np
-from datasets import ClassLabel, load_dataset
-from tqdm import tqdm
-
 import evaluate
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
-import transformers
+from datasets import ClassLabel, load_dataset
 from flax import struct, traverse_util
 from flax.jax_utils import pad_shard_unpad, replicate, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
@@ -695,7 +695,6 @@ def compute_metrics():
     total_steps = step_per_epoch * num_epochs
     epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
     for epoch in epochs:
-
         train_start = time.time()
         train_metrics = []
@@ -731,7 +730,6 @@ def compute_metrics():
             train_metrics = []
         if cur_step % training_args.eval_steps == 0 and cur_step > 0:
-
             eval_metrics = {}
             # evaluate
             for batch in tqdm(
diff --git a/examples/flax/vision/run_image_classification.py b/examples/flax/vision/run_image_classification.py
index 33a277fa4f8a21..6a88f0f8d67b28 100644
--- a/examples/flax/vision/run_image_classification.py
+++ b/examples/flax/vision/run_image_classification.py
@@ -29,21 +29,22 @@ from pathlib import Path
 from typing import Callable, Optional
+import jax
+import jax.numpy as jnp
+import optax
+
 # for dataset and preprocessing
 import torch
 import torchvision
 import torchvision.transforms as transforms
-from tqdm import tqdm
-
-import jax
-import jax.numpy as jnp
-import optax
-import transformers
 from flax import jax_utils
 from flax.jax_utils import pad_shard_unpad, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
diff --git a/examples/legacy/multiple_choice/run_multiple_choice.py b/examples/legacy/multiple_choice/run_multiple_choice.py
index d8007da6cb676c..451397042594f7 100644
--- a/examples/legacy/multiple_choice/run_multiple_choice.py
+++ b/examples/legacy/multiple_choice/run_multiple_choice.py
@@ -22,6 +22,7 @@ from typing import Dict, Optional
 import numpy as np
+from utils_multiple_choice import MultipleChoiceDataset, Split, processors
 import transformers
 from transformers import (
@@ -36,7 +37,6 @@
     set_seed,
 )
 from transformers.trainer_utils import is_main_process
-from utils_multiple_choice import MultipleChoiceDataset, Split, processors
 logger = logging.getLogger(__name__)
diff --git a/examples/legacy/multiple_choice/utils_multiple_choice.py b/examples/legacy/multiple_choice/utils_multiple_choice.py
index 3dbc3689cc4893..9ffaa7971b5624 100644
--- a/examples/legacy/multiple_choice/utils_multiple_choice.py
+++ b/examples/legacy/multiple_choice/utils_multiple_choice.py
@@ -26,8 +26,8 @@ from typing import List, Optional
 import tqdm
-
 from filelock import FileLock
+
 from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
@@ -112,7 +112,6 @@ def __init__(
             # and the others will use the cache.
             lock_path = cached_features_file + ".lock"
             with FileLock(lock_path):
-
                 if os.path.exists(cached_features_file) and not overwrite_cache:
                     logger.info(f"Loading features from cached file {cached_features_file}")
                     self.features = torch.load(cached_features_file)
diff --git a/examples/legacy/pytorch-lightning/lightning_base.py b/examples/legacy/pytorch-lightning/lightning_base.py
index b3104a25a8b129..f246ecab0dd01b 100644
--- a/examples/legacy/pytorch-lightning/lightning_base.py
+++ b/examples/legacy/pytorch-lightning/lightning_base.py
@@ -69,7 +69,7 @@ def __init__(
         config=None,
         tokenizer=None,
         model=None,
-        **config_kwargs
+        **config_kwargs,
     ):
         """Initialize a model, tokenizer and config."""
         super().__init__()
@@ -346,7 +346,7 @@ def generic_train(
     extra_callbacks=[],
     checkpoint_callback=None,
     logging_callback=None,
-    **extra_train_kwargs
+    **extra_train_kwargs,
 ):
     pl.seed_everything(args.seed)
diff --git a/examples/legacy/pytorch-lightning/run_glue.py b/examples/legacy/pytorch-lightning/run_glue.py
index 63b58bcf413c26..aa2349f2809fd4 100644
--- a/examples/legacy/pytorch-lightning/run_glue.py
+++ b/examples/legacy/pytorch-lightning/run_glue.py
@@ -7,21 +7,19 @@
 import numpy as np
 import torch
+from lightning_base import BaseTransformer, add_generic_args, generic_train
 from torch.utils.data import DataLoader, TensorDataset
-from lightning_base import BaseTransformer, add_generic_args, generic_train
 from transformers import glue_compute_metrics as compute_metrics
 from transformers import glue_convert_examples_to_features as convert_examples_to_features
-from transformers import glue_output_modes
+from transformers import glue_output_modes, glue_tasks_num_labels
 from transformers import glue_processors as processors
-from transformers import glue_tasks_num_labels
 logger = logging.getLogger(__name__)
 class GLUETransformer(BaseTransformer):
-
     mode = "sequence-classification"
     def __init__(self, hparams):
diff --git a/examples/legacy/pytorch-lightning/run_ner.py b/examples/legacy/pytorch-lightning/run_ner.py
index b1bdd125c22eb8..3bcbdfee03b114 100644
--- a/examples/legacy/pytorch-lightning/run_ner.py
+++ b/examples/legacy/pytorch-lightning/run_ner.py
@@ -7,11 +7,10 @@
 import numpy as np
 import torch
+from lightning_base import BaseTransformer, add_generic_args, generic_train
 from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
 from torch.nn import CrossEntropyLoss
 from torch.utils.data import DataLoader, TensorDataset
-
-from lightning_base import BaseTransformer, add_generic_args, generic_train
 from utils_ner import TokenClassificationTask
diff --git a/examples/legacy/question-answering/run_squad.py b/examples/legacy/question-answering/run_squad.py
index 674e7a9accbf3a..d966b3f02f0315 100644
--- a/examples/legacy/question-answering/run_squad.py
+++ b/examples/legacy/question-answering/run_squad.py
@@ -172,7 +172,6 @@ def train(args, train_dataset, model, tokenizer):
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
-
             # Skip past any already trained steps if resuming training
             if steps_trained_in_current_epoch > 0:
                 steps_trained_in_current_epoch -= 1
diff --git a/examples/legacy/question-answering/run_squad_trainer.py b/examples/legacy/question-answering/run_squad_trainer.py
index 314b140e828c59..7e3a6f28e0ba1e 100644
--- a/examples/legacy/question-answering/run_squad_trainer.py
+++ b/examples/legacy/question-answering/run_squad_trainer.py
@@ -30,9 +30,10 @@
     DataCollatorWithPadding,
     HfArgumentParser,
     SquadDataset,
+    Trainer,
+    TrainingArguments,
 )
 from transformers import SquadDataTrainingArguments as DataTrainingArguments
-from transformers import Trainer, TrainingArguments
 from transformers.trainer_utils import is_main_process
diff --git a/examples/legacy/run_chinese_ref.py b/examples/legacy/run_chinese_ref.py
index f7c09e37ff87d2..7d73580aa21566 100755
--- a/examples/legacy/run_chinese_ref.py
+++ b/examples/legacy/run_chinese_ref.py
@@ -4,6 +4,7 @@ from typing import List
 from ltp import LTP
+
 from transformers import BertTokenizer
@@ -93,7 +94,6 @@ def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokeni
     ref_ids = []
     for input_ids, chinese_word in zip(bert_res, ltp_res):
-
         input_tokens = []
         for id in input_ids:
             token = bert_tokenizer._convert_id_to_token(id)
diff --git a/examples/legacy/seq2seq/finetune_trainer.py b/examples/legacy/seq2seq/finetune_trainer.py
index f174f7fb5018f9..4e186c96d8c218 100755
--- a/examples/legacy/seq2seq/finetune_trainer.py
+++ b/examples/legacy/seq2seq/finetune_trainer.py
@@ -19,9 +19,10 @@ from dataclasses import dataclass, field
 from typing import Optional
-import transformers
 from seq2seq_trainer import Seq2SeqTrainer
 from seq2seq_training_args import Seq2SeqTrainingArguments
+
+import transformers
 from transformers import (
     AutoConfig,
     AutoModelForSeq2SeqLM,
@@ -337,7 +338,6 @@ def main():
         metrics["val_loss"] = round(metrics["val_loss"], 4)
         if trainer.is_world_process_zero():
-
             handle_metrics("val", metrics, training_args.output_dir)
             all_metrics.update(metrics)
diff --git a/examples/legacy/seq2seq/old_test_calculate_rouge.py b/examples/legacy/seq2seq/old_test_calculate_rouge.py
index 17b87cb481a650..6cc15e02552be1 100644
--- a/examples/legacy/seq2seq/old_test_calculate_rouge.py
+++ b/examples/legacy/seq2seq/old_test_calculate_rouge.py
@@ -16,8 +16,8 @@
 from pathlib import Path
 import pandas as pd
-
 from rouge_cli import calculate_rouge_path
+
 from utils import calculate_rouge
@@ -87,7 +87,6 @@ def test_single_sent_scores_dont_depend_on_newline_sep():
 def test_pegasus_newline():
-
     pred = [
         """" "a person who has such a video needs to immediately give it to the investigators," prosecutor says . "it is a very disturbing scene," editor-in-chief of bild online tells "erin burnett: outfront" """
     ]
diff --git a/examples/legacy/seq2seq/old_test_datasets.py b/examples/legacy/seq2seq/old_test_datasets.py
index b85d7966e97090..0b907b1ed9fbb6 100644
--- a/examples/legacy/seq2seq/old_test_datasets.py
+++ b/examples/legacy/seq2seq/old_test_datasets.py
@@ -17,11 +17,11 @@
 import numpy as np
 import pytest
-from torch.utils.data import DataLoader
-
 from pack_dataset import pack_data_dir
 from parameterized import parameterized
 from save_len_file import save_len_file
+from torch.utils.data import DataLoader
+
 from transformers import AutoTokenizer
 from transformers.models.mbart.modeling_mbart import shift_tokens_right
 from transformers.testing_utils import TestCasePlus, slow
diff --git a/examples/legacy/seq2seq/old_test_fsmt_bleu_score.py b/examples/legacy/seq2seq/old_test_fsmt_bleu_score.py
index beb7f2bc9857fd..4aefeb388be631 100644
--- a/examples/legacy/seq2seq/old_test_fsmt_bleu_score.py
+++ b/examples/legacy/seq2seq/old_test_fsmt_bleu_score.py
@@ -18,6 +18,7 @@
 import unittest
 from parameterized import parameterized
+
 from transformers import FSMTForConditionalGeneration, FSMTTokenizer
 from transformers.testing_utils import get_tests_dir, require_torch, slow, torch_device
 from utils import calculate_bleu
diff --git a/examples/legacy/seq2seq/old_test_seq2seq_examples.py b/examples/legacy/seq2seq/old_test_seq2seq_examples.py
index ecc0524c37d93b..864b97c7466a36 100644
--- a/examples/legacy/seq2seq/old_test_seq2seq_examples.py
+++ b/examples/legacy/seq2seq/old_test_seq2seq_examples.py
@@ -21,6 +21,7 @@
 from parameterized import parameterized
 from run_eval import run_generate
 from run_eval_search import run_search
+
 from transformers.testing_utils import CaptureStdout, TestCasePlus, slow
 from utils import ROUGE_KEYS
diff --git a/examples/legacy/seq2seq/pack_dataset.py b/examples/legacy/seq2seq/pack_dataset.py
index 6f226de2cc2ddd..8b069e452a7177 100755
--- a/examples/legacy/seq2seq/pack_dataset.py
+++ b/examples/legacy/seq2seq/pack_dataset.py
@@ -29,7 +29,6 @@
 def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
-
     finished_src, finished_tgt = [], []
     sorted_examples = list(zip(src_examples, tgt_examples))
diff --git a/examples/legacy/seq2seq/run_eval_search.py b/examples/legacy/seq2seq/run_eval_search.py
index e1a0c8660c9bf6..c72f038fc50ab2 100755
--- a/examples/legacy/seq2seq/run_eval_search.py
+++ b/examples/legacy/seq2seq/run_eval_search.py
@@ -20,6 +20,7 @@
 from collections import OrderedDict
 from run_eval import datetime_now, run_generate
+
 from utils import ROUGE_KEYS
diff --git a/examples/legacy/seq2seq/seq2seq_training_args.py b/examples/legacy/seq2seq/seq2seq_training_args.py
index 6ec220181ad90d..1583acd36fc4b7 100644
--- a/examples/legacy/seq2seq/seq2seq_training_args.py
+++ b/examples/legacy/seq2seq/seq2seq_training_args.py
@@ -17,6 +17,7 @@
 from typing import Optional
 from seq2seq_trainer import arg_to_scheduler
+
 from transformers import TrainingArguments
diff --git a/examples/legacy/seq2seq/utils.py b/examples/legacy/seq2seq/utils.py
index e207e4d0dbd068..2655165cf11adf 100644
--- a/examples/legacy/seq2seq/utils.py
+++ b/examples/legacy/seq2seq/utils.py
@@ -29,10 +29,10 @@
 import torch.distributed as dist
 from rouge_score import rouge_scorer, scoring
 from sacrebleu import corpus_bleu
+from sentence_splitter import add_newline_to_end_of_each_sentence
 from torch import nn
 from torch.utils.data import Dataset, Sampler
-from sentence_splitter import add_newline_to_end_of_each_sentence
 from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer
 from transformers.models.bart.modeling_bart import shift_tokens_right
 from transformers.utils import cached_property
@@ -132,7 +132,7 @@ def __init__(
         type_path="train",
         n_obs=None,
         prefix="",
-        **dataset_kwargs
+        **dataset_kwargs,
     ):
         super().__init__()
         self.src_file = Path(data_dir).joinpath(type_path + ".source")
diff --git a/examples/legacy/token-classification/run_ner.py b/examples/legacy/token-classification/run_ner.py
index 477ccb50fb2565..212ea986b4245b 100644
--- a/examples/legacy/token-classification/run_ner.py
+++ b/examples/legacy/token-classification/run_ner.py
@@ -24,6 +24,7 @@
 import numpy as np
 from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
 from torch import nn
+from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask
 import transformers
 from transformers import (
@@ -38,7 +39,6 @@
     set_seed,
 )
 from transformers.trainer_utils import is_main_process
-from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask
 logger = logging.getLogger(__name__)
diff --git a/examples/legacy/token-classification/run_tf_ner.py b/examples/legacy/token-classification/run_tf_ner.py
index 857d777238f2e2..df4770a70fa44d 100755
--- a/examples/legacy/token-classification/run_tf_ner.py
+++ b/examples/legacy/token-classification/run_tf_ner.py
@@ -24,6 +24,7 @@
 import numpy as np
 from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
+from utils_ner import Split, TFTokenClassificationDataset, TokenClassificationTask
 from transformers import (
     AutoConfig,
@@ -35,7 +36,6 @@
     TFTrainingArguments,
 )
 from transformers.utils import logging as hf_logging
-from utils_ner import Split, TFTokenClassificationDataset, TokenClassificationTask
 hf_logging.set_verbosity_info()
diff --git a/examples/legacy/token-classification/tasks.py b/examples/legacy/token-classification/tasks.py
index 409be0715da321..d893a2ab0347df 100644
--- a/examples/legacy/token-classification/tasks.py
+++ b/examples/legacy/token-classification/tasks.py
@@ -3,7 +3,6 @@ from typing import List, TextIO, Union
 from conllu import parse_incr
-
 from utils_ner import InputExample, Split, TokenClassificationTask
diff --git a/examples/legacy/token-classification/utils_ner.py b/examples/legacy/token-classification/utils_ner.py
index 35fcb5ef5b7d22..2b54c7c4a49159 100644
--- a/examples/legacy/token-classification/utils_ner.py
+++ b/examples/legacy/token-classification/utils_ner.py
@@ -23,6 +23,7 @@ from typing import List, Optional, Union
 from filelock import FileLock
+
 from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
@@ -240,7 +241,6 @@ def __init__(
             # and the others will use the cache.
             lock_path = cached_features_file + ".lock"
             with FileLock(lock_path):
-
                 if os.path.exists(cached_features_file) and not overwrite_cache:
                     logger.info(f"Loading features from cached file {cached_features_file}")
                     self.features = torch.load(cached_features_file)
diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py
index 6ccb5301f43ec3..20ddec4acb9ee1 100644
--- a/examples/pytorch/audio-classification/run_audio_classification.py
+++ b/examples/pytorch/audio-classification/run_audio_classification.py
@@ -23,10 +23,10 @@ from typing import Optional
 import datasets
+import evaluate
 import numpy as np
 from datasets import DatasetDict, load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py
index db6940fe38283a..78979e41553e0d 100644
--- a/examples/pytorch/image-classification/run_image_classification.py
+++ b/examples/pytorch/image-classification/run_image_classification.py
@@ -19,6 +19,7 @@ from dataclasses import dataclass, field
 from typing import Optional
+import evaluate
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -33,7 +34,6 @@
     ToTensor,
 )
-import evaluate
 import transformers
 from transformers import (
     MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index 15b27258d2bd56..3ba79d630e76a3 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -21,8 +21,13 @@
 from pathlib import Path
 import datasets
+import evaluate
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from torchvision.transforms import (
     CenterCrop,
@@ -35,12 +40,7 @@
 )
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import AutoConfig, AutoImageProcessor, AutoModelForImageClassification, SchedulerType, get_scheduler
 from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
 from transformers.utils.versions import require_version
diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py
index 2a20e2ef45f8ec..ae01b7614e63ba 100755
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -30,10 +30,10 @@ from typing import Optional
 import datasets
+import evaluate
 import torch
 from datasets import load_dataset
-import evaluate
 import transformers
 from transformers import (
     CONFIG_MAPPING,
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 1592a72897ee83..998ca60b2700ad 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -33,15 +33,15 @@
 import datasets
 import torch
+from accelerate import Accelerator, DistributedType
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 import transformers
-from accelerate import Accelerator, DistributedType
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py
index 9639af9513f98c..f44b0e3a01e7de 100755
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -30,9 +30,9 @@
 from typing import Optional
 import datasets
+import evaluate
 from datasets import load_dataset
-import evaluate
 import transformers
 from transformers import (
     CONFIG_MAPPING,
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 12d2bbcb546db7..ee469e48890e74 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -33,15 +33,15 @@
 import datasets
 import torch
+from accelerate import Accelerator, DistributedType
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 import transformers
-from accelerate import Accelerator, DistributedType
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index 6c178e3862af3d..b0bcc567551cc7 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -30,17 +30,17 @@
 from typing import Optional, Union
 import datasets
+import evaluate
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py
index db9b8701f23eda..dfbfe244e206f8 100755
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -25,11 +25,12 @@
 from typing import Optional
 import datasets
+import evaluate
 from datasets import load_dataset
+from trainer_qa import QuestionAnsweringTrainer
+from utils_qa import postprocess_qa_predictions
-import evaluate
 import transformers
-from trainer_qa import QuestionAnsweringTrainer
 from transformers import (
     AutoConfig,
     AutoModelForQuestionAnswering,
@@ -45,7 +46,6 @@
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-from utils_qa import postprocess_qa_predictions
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py
index 305d32f0ce3527..4d2f5ef51d99c7 100755
--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@@ -25,11 +25,12 @@
 from typing import Optional
 import datasets
+import evaluate
 from datasets import load_dataset
+from trainer_qa import QuestionAnsweringTrainer
+from utils_qa import postprocess_qa_predictions_with_beam_search
-import evaluate
 import transformers
-from trainer_qa import QuestionAnsweringTrainer
 from transformers import (
     DataCollatorWithPadding,
     EvalPrediction,
@@ -44,7 +45,6 @@
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-from utils_qa import postprocess_qa_predictions_with_beam_search
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index 25b19825adfe49..9372de3298f2c0 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -27,18 +27,19 @@
 from pathlib import Path
 import datasets
+import evaluate
 import numpy as np
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
+from utils_qa import postprocess_qa_predictions_with_beam_search
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     AdamW,
     DataCollatorWithPadding,
@@ -52,7 +53,6 @@
 )
 from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
 from transformers.utils.versions import require_version
-from utils_qa import postprocess_qa_predictions_with_beam_search
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index ffd62fe01ca445..6cbea37151da8e 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -27,18 +27,19 @@
 from pathlib import Path
 import datasets
+import evaluate
 import numpy as np
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
+from utils_qa import postprocess_qa_predictions
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
@@ -53,7 +54,6 @@
 )
 from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
 from transformers.utils.versions import require_version
-from utils_qa import postprocess_qa_predictions
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py
index f742f4c7a44ac3..5fe5c1bddc6cf9 100644
--- a/examples/pytorch/question-answering/run_seq2seq_qa.py
+++ b/examples/pytorch/question-answering/run_seq2seq_qa.py
@@ -25,11 +25,11 @@
 from typing import List, Optional, Tuple
 import datasets
+import evaluate
 from datasets import load_dataset
+from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer
-import evaluate
 import transformers
-from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer
 from transformers import (
     AutoConfig,
     AutoModelForSeq2SeqLM,
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
index 966c2fe48aafad..b1583aca1f0cac 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
@@ -21,17 +21,17 @@
 from dataclasses import dataclass, field
 from typing import Optional
+import evaluate
 import numpy as np
 import torch
 from datasets import load_dataset
+from huggingface_hub import hf_hub_download
 from PIL import Image
 from torch import nn
 from torchvision import transforms
 from torchvision.transforms import functional
-import evaluate
 import transformers
-from huggingface_hub import hf_hub_download
 from transformers import (
     AutoConfig,
     AutoImageProcessor,
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
index e715f20d5fe034..68919e0cc5c57c 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -22,21 +22,21 @@
 from pathlib import Path
 import datasets
+import evaluate
 import numpy as np
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo, hf_hub_download
 from PIL import Image
 from torch.utils.data import DataLoader
 from torchvision import transforms
 from torchvision.transforms import functional
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo, hf_hub_download
 from transformers import (
     AutoConfig,
     AutoImageProcessor,
diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
index 3b6acadec56812..603202e696cf9c 100755
--- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
@@ -24,14 +24,14 @@
 import datasets
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
 from datasets import DatasetDict, concatenate_datasets, load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data.dataloader import DataLoader
 from tqdm.auto import tqdm
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from huggingface_hub import Repository, create_repo
 from transformers import (
     AdamW,
     SchedulerType,
@@ -641,7 +641,6 @@ def prepare_dataset(batch):
             # update step
             if (step + 1) % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
-
                 # compute grad norm for monitoring
                 scale = (
                     accelerator.scaler._scale.item()
diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
index a282f66b8ad349..785609fbd56e88 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -26,11 +26,11 @@
 from typing import Dict, List, Optional, Union
 import datasets
+import evaluate
 import numpy as np
 import torch
 from datasets import DatasetDict, load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
@@ -708,7 +708,6 @@ def compute_metrics(pred):
     # Training
     if training_args.do_train:
-
         # use last checkpoint if exist
         if last_checkpoint is not None:
             checkpoint = last_checkpoint
diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
index 3791615bc1981b..c841e99df21557 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -26,10 +26,10 @@
 from typing import Any, Dict, List, Optional, Union
 import datasets
+import evaluate
 import torch
 from datasets import DatasetDict, load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index d680be320dd55a..b682f89ce5b829 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -25,13 +25,13 @@
 from typing import Optional
 import datasets
+import evaluate
 import nltk  # Here to have a nice missing dependency error message early on
 import numpy as np
 from datasets import load_dataset
+from filelock import FileLock
-import evaluate
 import transformers
-from filelock import FileLock
 from transformers import (
     AutoConfig,
     AutoModelForSeq2SeqLM,
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index e0c4176ec28904..8f669be72c5831 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -27,20 +27,20 @@
 from pathlib import Path
 import datasets
+import evaluate
 import nltk
 import numpy as np
 import torch
-from datasets import load_dataset
-from torch.utils.data import DataLoader
-from tqdm.auto import tqdm
-
-import evaluate
-import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
+from datasets import load_dataset
 from filelock import FileLock
 from huggingface_hub import Repository, create_repo
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+
+import transformers
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
diff --git a/examples/pytorch/test_accelerate_examples.py b/examples/pytorch/test_accelerate_examples.py
index 306cd9a34f161a..d88a2ead64b4ae 100644
--- a/examples/pytorch/test_accelerate_examples.py
+++ b/examples/pytorch/test_accelerate_examples.py
@@ -24,8 +24,8 @@
 from unittest import mock
 import torch
-
 from accelerate.utils import write_basic_config
+
 from transformers.testing_utils import TestCasePlus, get_gpu_count, run_command, slow, torch_device
 from transformers.utils import is_apex_available
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index 6b933bb86014ee..1e7ab534551192 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -24,10 +24,10 @@
 from typing import Optional
 import datasets
+import evaluate
 import numpy as np
 from datasets import load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index 040cb44daa0a20..03de2cf6b553f4 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -22,17 +22,17 @@
 from pathlib import Path
 import datasets
+import evaluate
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     AutoConfig,
     AutoModelForSequenceClassification,
diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py
index 6cf3530a87923d..20b38a37cf5bc7 100755
--- a/examples/pytorch/text-classification/run_xnli.py
+++ b/examples/pytorch/text-classification/run_xnli.py
@@ -25,10 +25,10 @@
 from typing import Optional
 import datasets
+import evaluate
 import numpy as np
 from datasets import load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index cfced5a1de6e57..065880e7e26e78 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -26,10 +26,10 @@
 from typing import Optional
 import datasets
+import evaluate
 import numpy as np
 from datasets import ClassLabel, load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index f60ed1fc785d02..ad630472234ea4 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -27,17 +27,17 @@
 from pathlib import Path
 import datasets
+import evaluate
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import ClassLabel, load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index 80787aba24bc57..cd82c779e8724d 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -25,10 +25,10 @@
 from typing import Optional
 import datasets
+import evaluate
 import numpy as np
 from datasets import load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index ece3385748da65..a853d531edb861 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -27,18 +27,18 @@
 from pathlib import Path
 import datasets
+import evaluate
 import numpy as np
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
@@ -69,7 +69,6 @@
 # Parsing input arguments
 def parse_args():
-
     parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
     parser.add_argument(
         "--dataset_name",
@@ -751,5 +750,4 @@ def postprocess_text(preds, labels):
 if __name__ == "__main__":
-
     main()
diff --git a/examples/research_projects/adversarial/run_hans.py b/examples/research_projects/adversarial/run_hans.py
index 0576471fbc50a6..3affbb7a69257a 100644
--- a/examples/research_projects/adversarial/run_hans.py
+++ b/examples/research_projects/adversarial/run_hans.py
@@ -22,6 +22,7 @@
 import numpy as np
 import torch
+from utils_hans import HansDataset, InputFeatures, hans_processors, hans_tasks_num_labels
 import transformers
 from transformers import (
@@ -35,7 +36,6 @@
     set_seed,
 )
 from transformers.trainer_utils import is_main_process
-from utils_hans import HansDataset, InputFeatures, hans_processors, hans_tasks_num_labels
 logger = logging.getLogger(__name__)
diff --git a/examples/research_projects/adversarial/utils_hans.py b/examples/research_projects/adversarial/utils_hans.py
index e54792ad2f82b9..f051e60f84fefd 100644
--- a/examples/research_projects/adversarial/utils_hans.py
+++ b/examples/research_projects/adversarial/utils_hans.py
@@ -20,8 +20,8 @@
 from typing import List, Optional, Union
 import tqdm
-
 from filelock import FileLock
+
 from transformers import (
     BartTokenizer,
     BartTokenizerFast,
@@ -134,7 +134,6 @@ def __init__(
             # and the others will use the cache.
             lock_path = cached_features_file + ".lock"
             with FileLock(lock_path):
-
                 if os.path.exists(cached_features_file) and not overwrite_cache:
                     logger.info(f"Loading features from cached file {cached_features_file}")
                     self.features = torch.load(cached_features_file)
diff --git a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
index d4121655e8233d..aad680f201c520 100755
--- a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
@@ -25,14 +25,14 @@
 import numpy as np
 import torch
+from pabee.modeling_pabee_albert import AlbertForSequenceClassificationWithPabee
+from pabee.modeling_pabee_bert import BertForSequenceClassificationWithPabee
 from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 import transformers
-from pabee.modeling_pabee_albert import AlbertForSequenceClassificationWithPabee
-from pabee.modeling_pabee_bert import BertForSequenceClassificationWithPabee
 from transformers import (
     WEIGHTS_NAME,
     AdamW,
@@ -173,7 +173,6 @@ def train(args, train_dataset, model, tokenizer):
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
-
             # Skip past any already trained steps if resuming training
             if steps_trained_in_current_epoch > 0:
                 steps_trained_in_current_epoch -= 1
@@ -263,7 +262,6 @@ def train(args, train_dataset, model, tokenizer):
 def evaluate(args, model, tokenizer, prefix="", patience=0):
-
     if args.model_type == "albert":
         model.albert.set_regression_threshold(args.regression_threshold)
         model.albert.set_patience(patience)
@@ -736,7 +734,6 @@ def main():
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
     for checkpoint in checkpoints:
-
         global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
         prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
diff --git a/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py b/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
index 22c6f4de06f430..6a084d0741d5f5 100644
--- a/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
@@ -4,6 +4,7 @@
 from unittest.mock import patch
 import run_glue_with_pabee
+
 from transformers.testing_utils import TestCasePlus
diff --git a/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py b/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
index ed2bb11f77b41b..53ba3829b15030 100644
--- a/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
@@ -24,9 +24,9 @@
 from collections import namedtuple
 import torch
-
 from model_bertabs import BertAbsSummarizer
 from models.model_builder import AbsSummarizer  # The authors' implementation
+
 from transformers import BertTokenizer
diff --git a/examples/research_projects/bertabs/modeling_bertabs.py b/examples/research_projects/bertabs/modeling_bertabs.py
index a7d8611a265f0d..33e216f4a08117 100644
--- a/examples/research_projects/bertabs/modeling_bertabs.py
+++ b/examples/research_projects/bertabs/modeling_bertabs.py
@@ -24,10 +24,10 @@
 import numpy as np
 import torch
+from configuration_bertabs import BertAbsConfig
 from torch import nn
 from torch.nn.init import xavier_uniform_
-from configuration_bertabs import BertAbsConfig
 from transformers import BertConfig, BertModel, PreTrainedModel
diff --git a/examples/research_projects/bertabs/run_summarization.py b/examples/research_projects/bertabs/run_summarization.py
index fcfae6b8c6c755..82ef8ab39ea9b7 100644
--- a/examples/research_projects/bertabs/run_summarization.py
+++ b/examples/research_projects/bertabs/run_summarization.py
@@ -6,10 +6,10 @@
 from collections import namedtuple
 import torch
+from modeling_bertabs import BertAbs, build_predictor
 from torch.utils.data import DataLoader, SequentialSampler
 from tqdm import tqdm
-from modeling_bertabs import BertAbs, build_predictor
 from transformers import BertTokenizer
 from .utils_summarization import (
@@ -45,7 +45,6 @@ def evaluate(args):
     generated_summaries = []
     import nltk
-
     import rouge
     nltk.download("punkt")
diff --git a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
index 8fc30b912468ba..927a15f9be679f 100644
--- a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
+++ b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
@@ -3,8 +3,8 @@
 import numpy as np
 from datasets import ClassLabel, DatasetDict, load_dataset
-
 from evaluate import load
+
 from transformers import (
     AutoModelForSequenceClassification,
     AutoTokenizer,
diff --git a/examples/research_projects/codeparrot/scripts/bpe_training.py b/examples/research_projects/codeparrot/scripts/bpe_training.py
index 8a3d6ee9eec19d..1cbeb4b4ee3240 100644
--- a/examples/research_projects/codeparrot/scripts/bpe_training.py
+++ b/examples/research_projects/codeparrot/scripts/bpe_training.py
@@ -1,7 +1,7 @@
+from arguments import TokenizerTrainingArguments
 from datasets import load_dataset
 from tqdm import tqdm
-from arguments import TokenizerTrainingArguments
 from transformers import AutoTokenizer, HfArgumentParser
 from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
diff --git a/examples/research_projects/codeparrot/scripts/codeparrot_training.py b/examples/research_projects/codeparrot/scripts/codeparrot_training.py
index b2af8767a217a6..2510e02c94700d 100644
--- a/examples/research_projects/codeparrot/scripts/codeparrot_training.py
+++ b/examples/research_projects/codeparrot/scripts/codeparrot_training.py
@@ -6,16 +6,16 @@
 import datasets
 import torch
+from accelerate import Accelerator, DistributedType
+from arguments import TrainingArguments
 from datasets import load_dataset
+from huggingface_hub import Repository
 from torch.optim import AdamW
 from torch.utils.data import IterableDataset
 from torch.utils.data.dataloader import DataLoader
 from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe
 import transformers
-from accelerate import Accelerator, DistributedType
-from arguments import TrainingArguments
-from huggingface_hub import Repository
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, get_scheduler, set_seed
diff --git a/examples/research_projects/codeparrot/scripts/human_eval.py b/examples/research_projects/codeparrot/scripts/human_eval.py
index d0614134ad4732..157079881d5f73 100644
--- a/examples/research_projects/codeparrot/scripts/human_eval.py
+++ b/examples/research_projects/codeparrot/scripts/human_eval.py
@@ -5,15 +5,15 @@
 from collections import defaultdict
 import torch
+from accelerate import Accelerator
+from accelerate.utils import set_seed
+from arguments import HumanEvalArguments
 from datasets import load_dataset, load_metric
 from torch.utils.data import IterableDataset
 from torch.utils.data.dataloader import DataLoader
 from tqdm import tqdm
 import transformers
-from accelerate import Accelerator
-from accelerate.utils import set_seed
-from arguments import HumanEvalArguments
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, StoppingCriteria, StoppingCriteriaList
diff --git a/examples/research_projects/codeparrot/scripts/initialize_model.py b/examples/research_projects/codeparrot/scripts/initialize_model.py
index 9d066b19087396..6bf028688f1262 100644
--- a/examples/research_projects/codeparrot/scripts/initialize_model.py
+++ b/examples/research_projects/codeparrot/scripts/initialize_model.py
@@ -1,4 +1,5 @@
 from arguments import InitializationArguments
+
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
diff --git a/examples/research_projects/codeparrot/scripts/minhash_deduplication.py b/examples/research_projects/codeparrot/scripts/minhash_deduplication.py
index 9e1ef11ff07d15..195a9dc8096b21 100644
--- a/examples/research_projects/codeparrot/scripts/minhash_deduplication.py
+++ b/examples/research_projects/codeparrot/scripts/minhash_deduplication.py
@@ -6,10 +6,9 @@
 from typing import Dict, List, Optional, Set, Tuple, Type
 from datasets import Dataset
-from tqdm import tqdm
-
 from datasketch import MinHash, MinHashLSH
 from dpu_utils.utils.iterators import ThreadedIterator
+from tqdm import tqdm
 NON_ALPHA = re.compile("[^A-Za-z_0-9]")
diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py
index 6236a8aad86aa1..07540d0b628433 100644
--- a/examples/research_projects/codeparrot/scripts/preprocessing.py
+++ b/examples/research_projects/codeparrot/scripts/preprocessing.py
@@ -9,10 +9,10 @@
 from pathlib import Path
 import numpy as np
-from datasets import load_dataset
-
 from arguments import PreprocessingArguments
+from datasets import load_dataset
 from minhash_deduplication import deduplicate_dataset
+
 from transformers import AutoTokenizer, HfArgumentParser
diff --git a/examples/research_projects/codeparrot/scripts/pretokenizing.py b/examples/research_projects/codeparrot/scripts/pretokenizing.py
index 9ebe1e577ddefa..5eb793d10d959c 100644
--- a/examples/research_projects/codeparrot/scripts/pretokenizing.py
+++ b/examples/research_projects/codeparrot/scripts/pretokenizing.py
@@ -1,9 +1,9 @@
 import multiprocessing
 import time
+from arguments import PretokenizationArguments
 from datasets import load_dataset
-from arguments import PretokenizationArguments from transformers import AutoTokenizer, HfArgumentParser diff --git a/examples/research_projects/codeparrot/scripts/tests/test_deduplicate.py b/examples/research_projects/codeparrot/scripts/tests/test_deduplicate.py index e4438271355707..aaf53de137f490 100644 --- a/examples/research_projects/codeparrot/scripts/tests/test_deduplicate.py +++ b/examples/research_projects/codeparrot/scripts/tests/test_deduplicate.py @@ -1,7 +1,6 @@ from unittest import TestCase from datasets import Dataset - from minhash_deduplication import deduplicate_dataset, make_duplicate_clusters diff --git a/examples/research_projects/codeparrot/scripts/validation_loss.py b/examples/research_projects/codeparrot/scripts/validation_loss.py index 280a79dbed0824..929c2df427e227 100644 --- a/examples/research_projects/codeparrot/scripts/validation_loss.py +++ b/examples/research_projects/codeparrot/scripts/validation_loss.py @@ -1,12 +1,12 @@ import logging import torch +from accelerate import Accelerator +from arguments import EvaluationArguments from datasets import load_dataset from torch.utils.data import IterableDataset from torch.utils.data.dataloader import DataLoader -from accelerate import Accelerator -from arguments import EvaluationArguments from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, set_seed diff --git a/examples/research_projects/decision_transformer/run_decision_transformer.py b/examples/research_projects/decision_transformer/run_decision_transformer.py index a1e4785d29fc2e..d6c3e28331259d 100644 --- a/examples/research_projects/decision_transformer/run_decision_transformer.py +++ b/examples/research_projects/decision_transformer/run_decision_transformer.py @@ -1,8 +1,8 @@ +import gym import numpy as np import torch - -import gym from mujoco_py import GlfwContext + from transformers import DecisionTransformerModel diff --git a/examples/research_projects/deebert/src/modeling_highway_bert.py b/examples/research_projects/deebert/src/modeling_highway_bert.py index 37d81248ed4550..2a881decbbd529 100644 --- a/examples/research_projects/deebert/src/modeling_highway_bert.py +++ b/examples/research_projects/deebert/src/modeling_highway_bert.py @@ -229,7 +229,10 @@ def forward( sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[ + outputs = ( + sequence_output, + pooled_output, + ) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions), highway exits diff --git a/examples/research_projects/deebert/src/modeling_highway_roberta.py b/examples/research_projects/deebert/src/modeling_highway_roberta.py index c8358ac99454fd..c21fb32fde762a 100644 --- a/examples/research_projects/deebert/src/modeling_highway_roberta.py +++ b/examples/research_projects/deebert/src/modeling_highway_roberta.py @@ -19,7 +19,6 @@ ROBERTA_START_DOCSTRING, ) class DeeRobertaModel(DeeBertModel): - config_class = RobertaConfig base_model_prefix = "roberta" @@ -36,7 +35,6 @@ def __init__(self, config): ROBERTA_START_DOCSTRING, ) class DeeRobertaForSequenceClassification(BertPreTrainedModel): - config_class = RobertaConfig base_model_prefix = "roberta" diff --git a/examples/research_projects/deebert/test_glue_deebert.py b/examples/research_projects/deebert/test_glue_deebert.py index 7a709308e6f716..775c4d70b6523e 100644 --- a/examples/research_projects/deebert/test_glue_deebert.py 
+++ b/examples/research_projects/deebert/test_glue_deebert.py @@ -4,6 +4,7 @@ from unittest.mock import patch import run_glue_deebert + from transformers.testing_utils import TestCasePlus, get_gpu_count, require_torch_non_multi_gpu, slow @@ -45,7 +46,6 @@ def run_and_check(self, args): @slow @require_torch_non_multi_gpu def test_glue_deebert_train(self): - train_args = """ --model_type roberta --model_name_or_path roberta-base diff --git a/examples/research_projects/distillation/distiller.py b/examples/research_projects/distillation/distiller.py index fc5dc58941f7ba..3ef2ba87b2e211 100644 --- a/examples/research_projects/distillation/distiller.py +++ b/examples/research_projects/distillation/distiller.py @@ -21,14 +21,14 @@ import psutil import torch +from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups +from lm_seqs_dataset import LmSeqsDataset from torch import nn from torch.optim import AdamW from torch.utils.data import BatchSampler, DataLoader, RandomSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm -from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups -from lm_seqs_dataset import LmSeqsDataset from transformers import get_linear_schedule_with_warmup from utils import logger diff --git a/examples/research_projects/distillation/run_squad_w_distillation.py b/examples/research_projects/distillation/run_squad_w_distillation.py index 3acfd468640626..aba91995da0c3f 100644 --- a/examples/research_projects/distillation/run_squad_w_distillation.py +++ b/examples/research_projects/distillation/run_squad_w_distillation.py @@ -189,7 +189,6 @@ def train(args, train_dataset, model, tokenizer, teacher=None): for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): - # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 diff --git a/examples/research_projects/distillation/train.py b/examples/research_projects/distillation/train.py index cc2362888e4725..bb35a1df853943 100644 --- a/examples/research_projects/distillation/train.py +++ b/examples/research_projects/distillation/train.py @@ -24,9 +24,9 @@ import numpy as np import torch - from distiller import Distiller from lm_seqs_dataset import LmSeqsDataset + from transformers import ( BertConfig, BertForMaskedLM, diff --git a/examples/research_projects/information-gain-filtration/igf/igf.py b/examples/research_projects/information-gain-filtration/igf/igf.py index 99bd8c2d06d71c..6861467a33592a 100644 --- a/examples/research_projects/information-gain-filtration/igf/igf.py +++ b/examples/research_projects/information-gain-filtration/igf/igf.py @@ -5,13 +5,13 @@ import logging import random +import joblib import numpy as np import torch import torch.nn as nn from torch.utils.data import DataLoader from tqdm import tqdm -import joblib from transformers import AdamW, GPT2LMHeadModel, get_linear_schedule_with_warmup @@ -119,7 +119,6 @@ def recopy_gpt2(orig_model, device, max_steps): def intermittent_save(contexts, real_perps, past_perps, filename): - """ save the perplexity differences to filename @@ -152,7 +151,6 @@ def collect_objective_set( filename="dev.jbl", recopy_model=recopy_gpt2, ): - """ Collect individual IGF values from pre-trained transformer model max_steps samples of training data to train secondary model @@ -271,7 +269,6 @@ def generate_datasets( def 
train_secondary_learner( secondary_learner, train_dataset, max_epochs, batch_size, eval_freq=50, igf_model_path="secondary_learner.pt" ): - """ Train the secondary learner (igf_model) diff --git a/examples/research_projects/information-gain-filtration/run_clm_igf.py b/examples/research_projects/information-gain-filtration/run_clm_igf.py index eae10060b22fd1..c1584a2f89adc1 100644 --- a/examples/research_projects/information-gain-filtration/run_clm_igf.py +++ b/examples/research_projects/information-gain-filtration/run_clm_igf.py @@ -28,11 +28,9 @@ import argparse import random +import joblib import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler - -import joblib from igf.igf import ( SecondaryLearner, collect_objective_set, @@ -43,6 +41,8 @@ set_seed, train_secondary_learner, ) +from torch.utils.data import DataLoader, RandomSampler + from transformers import GPT2LMHeadModel @@ -55,7 +55,6 @@ def generate_n_pairs( data_file="data/tokenized_stories_train_wikitext103.jbl", igf_data_file="igf_context_pairs.jbl", ): - """ Collecting *n* pairs for training the secondary learner Args: diff --git a/examples/research_projects/jax-projects/big_bird/bigbird_flax.py b/examples/research_projects/jax-projects/big_bird/bigbird_flax.py index b9ff9da28140a4..ac37cbc8600a2f 100644 --- a/examples/research_projects/jax-projects/big_bird/bigbird_flax.py +++ b/examples/research_projects/jax-projects/big_bird/bigbird_flax.py @@ -4,8 +4,6 @@ from functools import partial from typing import Callable -from tqdm.auto import tqdm - import flax.linen as nn import jax import jax.numpy as jnp @@ -16,6 +14,8 @@ from flax.serialization import from_bytes, to_bytes from flax.training import train_state from flax.training.common_utils import shard +from tqdm.auto import tqdm + from transformers import BigBirdConfig, FlaxBigBirdForQuestionAnswering from transformers.models.big_bird.modeling_flax_big_bird import FlaxBigBirdForQuestionAnsweringModule @@ -98,7 +98,6 @@ def __post_init__(self): @dataclass class DataCollator: - pad_id: int max_length: int = 4096 # no dynamic padding on TPUs diff --git a/examples/research_projects/jax-projects/big_bird/evaluate.py b/examples/research_projects/jax-projects/big_bird/evaluate.py index e3309f494e34b2..32ca5172a5f25c 100644 --- a/examples/research_projects/jax-projects/big_bird/evaluate.py +++ b/examples/research_projects/jax-projects/big_bird/evaluate.py @@ -1,8 +1,8 @@ -from datasets import load_from_disk - import jax import jax.numpy as jnp from bigbird_flax import FlaxBigBirdForNaturalQuestions +from datasets import load_from_disk + from transformers import BigBirdTokenizerFast diff --git a/examples/research_projects/jax-projects/big_bird/prepare_natural_questions.py b/examples/research_projects/jax-projects/big_bird/prepare_natural_questions.py index 8d2f69031e2ab4..22dc3e455024c0 100644 --- a/examples/research_projects/jax-projects/big_bird/prepare_natural_questions.py +++ b/examples/research_projects/jax-projects/big_bird/prepare_natural_questions.py @@ -1,10 +1,9 @@ import os +import jsonlines import numpy as np from tqdm import tqdm -import jsonlines - DOC_STRIDE = 2048 MAX_LENGTH = 4096 diff --git a/examples/research_projects/jax-projects/big_bird/train.py b/examples/research_projects/jax-projects/big_bird/train.py index 3d67c9d97f6758..ce37b7f975bb3a 100644 --- a/examples/research_projects/jax-projects/big_bird/train.py +++ b/examples/research_projects/jax-projects/big_bird/train.py @@ -1,12 +1,12 @@ import os from dataclasses import replace 
-from datasets import load_dataset - import jax import wandb from bigbird_flax import Args, DataCollator, FlaxBigBirdForNaturalQuestions, Trainer, build_tx, train_step, val_step +from datasets import load_dataset from flax import jax_utils + from transformers import BigBirdTokenizerFast diff --git a/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py b/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py index e4bec5e2886691..3c5bdb7b44507c 100755 --- a/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py +++ b/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py @@ -32,17 +32,17 @@ from typing import Dict, List, Optional, Tuple import datasets -import numpy as np -from datasets import load_dataset -from tqdm import tqdm - import flax import jax import jax.numpy as jnp +import numpy as np import optax +from datasets import load_dataset from flax import jax_utils, traverse_util from flax.training import train_state from flax.training.common_utils import get_metrics, onehot, shard +from tqdm import tqdm + from transformers import ( CONFIG_MAPPING, FLAX_MODEL_FOR_MASKED_LM_MAPPING, diff --git a/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py b/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py index a5a395272fdc22..e60f07bdd06325 100644 --- a/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py +++ b/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py @@ -20,6 +20,7 @@ import jax.numpy as jnp from configuration_hybrid_clip import HybridCLIPConfig from flax.core.frozen_dict import FrozenDict + from transformers import FLAX_MODEL_MAPPING, FlaxCLIPVisionModel from transformers.modeling_flax_utils import FlaxPreTrainedModel from transformers.models.clip.modeling_flax_clip import FlaxCLIPOutput @@ -132,7 +133,7 @@ def __init__( input_shape: Optional[Tuple] = None, seed: int = 0, dtype: jnp.dtype = jnp.float32, - **kwargs + **kwargs, ): if input_shape is None: input_shape = ((1, 1), (1, config.vision_config.image_size, config.vision_config.image_size, 3)) diff --git a/examples/research_projects/jax-projects/hybrid_clip/run_hybrid_clip.py b/examples/research_projects/jax-projects/hybrid_clip/run_hybrid_clip.py index 1be46f6af99368..f54641408f80a2 100644 --- a/examples/research_projects/jax-projects/hybrid_clip/run_hybrid_clip.py +++ b/examples/research_projects/jax-projects/hybrid_clip/run_hybrid_clip.py @@ -32,22 +32,22 @@ from pathlib import Path from typing import Callable, Optional -import torch -from torchvision.datasets import VisionDataset -from torchvision.io import ImageReadMode, read_image -from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize -from torchvision.transforms.functional import InterpolationMode -from tqdm import tqdm - import jax import jax.numpy as jnp import optax -import transformers +import torch from flax import jax_utils from flax.jax_utils import unreplicate from flax.training import train_state from flax.training.common_utils import get_metrics, shard, shard_prng_key from modeling_hybrid_clip import FlaxHybridCLIP +from torchvision.datasets import VisionDataset +from torchvision.io import ImageReadMode, read_image +from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize +from torchvision.transforms.functional import InterpolationMode +from tqdm import tqdm + +import transformers from transformers import 
AutoTokenizer, HfArgumentParser, TrainingArguments, is_tensorboard_available, set_seed diff --git a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py index 16eb1007b4c73b..7103b5a28111ff 100644 --- a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py +++ b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py @@ -28,19 +28,19 @@ from typing import Callable, Optional import datasets -import numpy as np -from datasets import Dataset, load_dataset -from tqdm import tqdm - import jax import jax.numpy as jnp +import numpy as np import optax -import transformers +from datasets import Dataset, load_dataset from flax.core.frozen_dict import freeze, unfreeze from flax.training.common_utils import onehot, stack_forest from jax.experimental.maps import mesh from jax.experimental.pjit import pjit from partitions import set_partitions +from tqdm import tqdm + +import transformers from transformers import ( CONFIG_MAPPING, FLAX_MODEL_FOR_CAUSAL_LM_MAPPING, diff --git a/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py b/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py index 71bf60d2c6027d..5034e1ee9137a2 100755 --- a/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py +++ b/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py @@ -6,18 +6,18 @@ from pathlib import Path from typing import Dict, List, Optional, Union -import numpy as np -from datasets import DatasetDict, load_dataset -from tqdm import tqdm - import flax import jax import jax.numpy as jnp import librosa +import numpy as np import optax +from datasets import DatasetDict, load_dataset from flax import jax_utils, traverse_util from flax.training import train_state from flax.training.common_utils import get_metrics, onehot, shard +from tqdm import tqdm + from transformers import ( FlaxWav2Vec2ForPreTraining, HfArgumentParser, diff --git a/examples/research_projects/longform-qa/eli5_app.py b/examples/research_projects/longform-qa/eli5_app.py index 7782d6433ba7c5..1bcb6fd20d25fc 100644 --- a/examples/research_projects/longform-qa/eli5_app.py +++ b/examples/research_projects/longform-qa/eli5_app.py @@ -1,11 +1,9 @@ import datasets +import faiss import numpy as np import streamlit as st import torch from elasticsearch import Elasticsearch - -import faiss -import transformers from eli5_utils import ( embed_questions_for_retrieval, make_qa_s2s_model, @@ -13,6 +11,8 @@ query_es_index, query_qa_dense_index, ) + +import transformers from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer diff --git a/examples/research_projects/longform-qa/eli5_utils.py b/examples/research_projects/longform-qa/eli5_utils.py index 82c4bd8caf20d3..db4eae66041be4 100644 --- a/examples/research_projects/longform-qa/eli5_utils.py +++ b/examples/research_projects/longform-qa/eli5_utils.py @@ -5,6 +5,7 @@ from time import time import datasets # noqa: F401 +import faiss # noqa: F401 import numpy as np import pandas as pd import torch @@ -15,7 +16,6 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler from tqdm import tqdm -import faiss # noqa: F401 from transformers import AdamW, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup diff --git a/examples/research_projects/luke/run_luke_ner_no_trainer.py b/examples/research_projects/luke/run_luke_ner_no_trainer.py index 
cb81402425ff2d..4c5227d2c7e011 100644 --- a/examples/research_projects/luke/run_luke_ner_no_trainer.py +++ b/examples/research_projects/luke/run_luke_ner_no_trainer.py @@ -27,14 +27,14 @@ import datasets import torch +from accelerate import Accelerator, DistributedDataParallelKwargs from datasets import ClassLabel, load_dataset, load_metric +from huggingface_hub import Repository +from luke_utils import DataCollatorForLukeTokenClassification, is_punctuation, padding_tensor from torch.utils.data import DataLoader from tqdm.auto import tqdm import transformers -from accelerate import Accelerator, DistributedDataParallelKwargs -from huggingface_hub import Repository -from luke_utils import DataCollatorForLukeTokenClassification, is_punctuation, padding_tensor from transformers import ( AdamW, LukeConfig, diff --git a/examples/research_projects/lxmert/extracting_data.py b/examples/research_projects/lxmert/extracting_data.py index 9790e20ad86bf9..9c445be336f553 100644 --- a/examples/research_projects/lxmert/extracting_data.py +++ b/examples/research_projects/lxmert/extracting_data.py @@ -9,9 +9,9 @@ import datasets import numpy as np import torch - from modeling_frcnn import GeneralizedRCNN from processing_image import Preprocess + from utils import Config diff --git a/examples/research_projects/lxmert/modeling_frcnn.py b/examples/research_projects/lxmert/modeling_frcnn.py index 33c1133e9589f4..08758b1d3cac06 100644 --- a/examples/research_projects/lxmert/modeling_frcnn.py +++ b/examples/research_projects/lxmert/modeling_frcnn.py @@ -169,7 +169,6 @@ def get_norm(norm, out_channels): def _create_grid_offsets(size: List[int], stride: int, offset: float, device): - grid_height, grid_width = size shifts_x = torch.arange( offset * stride, @@ -390,7 +389,6 @@ def assign_boxes_to_levels( canonical_box_size: int, canonical_level: int, ): - box_sizes = torch.sqrt(torch.cat([boxes.area() for boxes in box_lists])) # Eqn.(1) in FPN paper level_assignments = torch.floor(canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8)) @@ -1708,9 +1706,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): - assert from_tf, ( - "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint" - .format(pretrained_model_name_or_path + ".index") + assert ( + from_tf + ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( + pretrained_model_name_or_path + ".index" ) archive_file = pretrained_model_name_or_path + ".index" else: diff --git a/examples/research_projects/lxmert/utils.py b/examples/research_projects/lxmert/utils.py index 8e830fb8359d29..2fc6ea2062efd2 100644 --- a/examples/research_projects/lxmert/utils.py +++ b/examples/research_projects/lxmert/utils.py @@ -34,14 +34,13 @@ from urllib.parse import urlparse from zipfile import ZipFile, is_zipfile -import numpy as np -from PIL import Image -from tqdm.auto import tqdm - import cv2 +import numpy as np import requests import wget from filelock import FileLock +from PIL import Image +from tqdm.auto import tqdm from yaml import Loader, dump, load @@ -181,7 +180,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): @classmethod def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs): - cache_dir = 
kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -225,14 +223,13 @@ def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs): # quick compare tensors def compare(in_tensor): - out_tensor = torch.load("dump.pt", map_location=in_tensor.device) n1 = in_tensor.numpy() n2 = out_tensor.numpy()[0] print(n1.shape, n1[0, 0, :5]) print(n2.shape, n2[0, 0, :5]) assert np.allclose(n1, n2, rtol=0.01, atol=0.1), ( - f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x == False])/len(n1.flatten())*100:.4f} %" + f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x is False])/len(n1.flatten())*100:.4f} %" " element-wise mismatch" ) raise Exception("tensors are all good") @@ -300,7 +297,6 @@ def get_from_cache( user_agent=None, local_files_only=False, ): - if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(cache_dir, Path): @@ -355,7 +351,6 @@ def get_from_cache( # Prevent parallel downloads of the same file with a lock. lock_path = cache_path + ".lock" with FileLock(lock_path): - # If the download just completed while the lock was activated. if os.path.exists(cache_path) and not force_download: # Even if returning early like here, the lock will be released. @@ -406,7 +401,6 @@ def _resumable_file_manager(): def url_to_filename(url, etag=None): - url_bytes = url.encode("utf-8") url_hash = sha256(url_bytes) filename = url_hash.hexdigest() diff --git a/examples/research_projects/lxmert/visualizing_image.py b/examples/research_projects/lxmert/visualizing_image.py index a02dc66dfb7c61..163d661e873ec3 100644 --- a/examples/research_projects/lxmert/visualizing_image.py +++ b/examples/research_projects/lxmert/visualizing_image.py @@ -18,6 +18,7 @@ import colorsys import io +import cv2 import matplotlib as mpl import matplotlib.colors as mplc import matplotlib.figure as mplfigure @@ -25,7 +26,6 @@ import torch from matplotlib.backends.backend_agg import FigureCanvasAgg -import cv2 from utils import img_tensorize diff --git a/examples/research_projects/mlm_wwm/run_chinese_ref.py b/examples/research_projects/mlm_wwm/run_chinese_ref.py index 4d1c9e81e94ac3..eca89df97982da 100644 --- a/examples/research_projects/mlm_wwm/run_chinese_ref.py +++ b/examples/research_projects/mlm_wwm/run_chinese_ref.py @@ -3,6 +3,7 @@ from typing import List from ltp import LTP + from transformers.models.bert.tokenization_bert import BertTokenizer diff --git a/examples/research_projects/mm-imdb/run_mmimdb.py b/examples/research_projects/mm-imdb/run_mmimdb.py index 9f12257a10a8cb..23b2a65e5c96ac 100644 --- a/examples/research_projects/mm-imdb/run_mmimdb.py +++ b/examples/research_projects/mm-imdb/run_mmimdb.py @@ -30,6 +30,7 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange +from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels import transformers from transformers import ( @@ -43,7 +44,6 @@ get_linear_schedule_with_warmup, ) from transformers.trainer_utils import is_main_process -from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels try: diff --git a/examples/research_projects/movement-pruning/bertarize.py b/examples/research_projects/movement-pruning/bertarize.py index 623b46b94386fd..0c9cc63571d7c1 100644 --- 
a/examples/research_projects/movement-pruning/bertarize.py +++ b/examples/research_projects/movement-pruning/bertarize.py @@ -22,7 +22,6 @@ import shutil import torch - from emmental.modules import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer diff --git a/examples/research_projects/movement-pruning/counts_parameters.py b/examples/research_projects/movement-pruning/counts_parameters.py index 0aec3766b3f95c..17ddb029f89780 100644 --- a/examples/research_projects/movement-pruning/counts_parameters.py +++ b/examples/research_projects/movement-pruning/counts_parameters.py @@ -19,7 +19,6 @@ import os import torch - from emmental.modules import ThresholdBinarizer, TopKBinarizer diff --git a/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py b/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py index 66d78b0c8fdc19..2a3bd763a2de36 100644 --- a/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py +++ b/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py @@ -50,7 +50,7 @@ def __init__( pruning_method="topK", mask_init="constant", mask_scale=0.0, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py b/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py index 4228050fe123b3..d404bf49aaa62d 100644 --- a/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py +++ b/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py @@ -649,7 +649,10 @@ def forward( sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[ + outputs = ( + sequence_output, + pooled_output, + ) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) diff --git a/examples/research_projects/movement-pruning/masked_run_glue.py b/examples/research_projects/movement-pruning/masked_run_glue.py index e81cf9209c889d..4ce56e524f714b 100644 --- a/examples/research_projects/movement-pruning/masked_run_glue.py +++ b/examples/research_projects/movement-pruning/masked_run_glue.py @@ -24,12 +24,12 @@ import numpy as np import torch +from emmental import MaskedBertConfig, MaskedBertForSequenceClassification from torch import nn from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange -from emmental import MaskedBertConfig, MaskedBertForSequenceClassification from transformers import ( WEIGHTS_NAME, AdamW, @@ -228,7 +228,6 @@ def train(args, train_dataset, model, tokenizer, teacher=None): for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): - # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 diff --git a/examples/research_projects/movement-pruning/masked_run_squad.py b/examples/research_projects/movement-pruning/masked_run_squad.py index 1bd501eda51440..a516bb8d585ddd 100644 --- a/examples/research_projects/movement-pruning/masked_run_squad.py +++ b/examples/research_projects/movement-pruning/masked_run_squad.py @@ -25,12 +25,12 @@ import numpy 
as np import torch +from emmental import MaskedBertConfig, MaskedBertForQuestionAnswering from torch import nn from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange -from emmental import MaskedBertConfig, MaskedBertForQuestionAnswering from transformers import ( WEIGHTS_NAME, AdamW, @@ -236,7 +236,6 @@ def train(args, train_dataset, model, tokenizer, teacher=None): for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): - # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 diff --git a/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py b/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py index 8f4760580fd9ba..5c1b0da700024b 100644 --- a/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py +++ b/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py @@ -264,7 +264,6 @@ def greedy_search( past: List[torch.Tensor] = [] while cur_len < max_length: - logits, past = self._decoder_forward(input_ids, encoder_output, attention_mask, past) next_token_logits = logits[:, -1, :] @@ -303,7 +302,6 @@ def _prepare_decoder_input_ids_for_generation( decoder_start_token_id, bos_token_id: Optional[int] = None, ) -> torch.LongTensor: - decoder_input_ids = ( torch.ones((input_ids.shape[0], 1), dtype=input_ids.dtype, device=input_ids.device) * decoder_start_token_id @@ -633,7 +631,6 @@ def _reorder_cache(self, past: List[torch.Tensor], beam_idx): def beam_search( self, input_ids, encoder_output, attention_mask, num_beams, max_length, pad_token_id: int, eos_token_id: int ): - batch_size = self.beam_scorer.batch_size num_beams = self.beam_scorer.num_beams diff --git a/examples/research_projects/onnx/summarization/bart_onnx/reduce_onnx_size.py b/examples/research_projects/onnx/summarization/bart_onnx/reduce_onnx_size.py index 63fae44ffac6bc..d327cdb2841d9c 100644 --- a/examples/research_projects/onnx/summarization/bart_onnx/reduce_onnx_size.py +++ b/examples/research_projects/onnx/summarization/bart_onnx/reduce_onnx_size.py @@ -5,7 +5,6 @@ import os import numpy - import onnx diff --git a/examples/research_projects/onnx/summarization/run_onnx_exporter.py b/examples/research_projects/onnx/summarization/run_onnx_exporter.py index 5d751ace8eee10..889eefb4e74b56 100644 --- a/examples/research_projects/onnx/summarization/run_onnx_exporter.py +++ b/examples/research_projects/onnx/summarization/run_onnx_exporter.py @@ -22,12 +22,12 @@ import sys import numpy as np -import torch - import onnxruntime -import transformers +import torch from bart_onnx.generation_onnx import BARTBeamSearchGenerator from bart_onnx.reduce_onnx_size import remove_dup_initializers + +import transformers from transformers import BartForConditionalGeneration, BartTokenizer diff --git a/examples/research_projects/performer/modeling_flax_performer.py b/examples/research_projects/performer/modeling_flax_performer.py index b4b9924fae2716..7c2fde6ddbb5dc 100644 --- a/examples/research_projects/performer/modeling_flax_performer.py +++ b/examples/research_projects/performer/modeling_flax_performer.py @@ -15,13 +15,13 @@ from typing import Callable, Dict, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from 
jax.random import PRNGKey from modeling_flax_performer_utils import make_fast_softmax_attention + from transformers.file_utils import add_start_docstrings from transformers.modeling_flax_utils import ACT2FN from transformers.models.bert.configuration_bert import BertConfig @@ -366,7 +366,6 @@ def convert_from_pytorch(pt_state: Dict, config: BertConfig) -> Dict: # SelfAttention needs also to replace "weight" by "kernel" if {"query", "key", "value"} & key_parts: - # Flax SelfAttention decomposes the heads (num_head, size // num_heads) if "bias" in key: jax_state[key] = tensor.reshape((config.num_attention_heads, -1)) @@ -443,7 +442,6 @@ def module(self) -> nn.Module: def __call__( self, input_ids, token_type_ids=None, position_ids=None, dropout_rng: PRNGKey = None, attention_mask=None ): - input_ids, attention_mask, token_type_ids, position_ids = self._check_inputs( input_ids, attention_mask, token_type_ids, position_ids ) diff --git a/examples/research_projects/performer/modeling_flax_performer_utils.py b/examples/research_projects/performer/modeling_flax_performer_utils.py index 915e2fa23dd98f..6e6173729cc348 100644 --- a/examples/research_projects/performer/modeling_flax_performer_utils.py +++ b/examples/research_projects/performer/modeling_flax_performer_utils.py @@ -30,11 +30,10 @@ import functools from collections.abc import Iterable # pylint: disable=g-importing-member -import numpy as onp -from absl import logging - import jax import jax.numpy as jnp +import numpy as onp +from absl import logging from jax import lax, random @@ -524,7 +523,6 @@ def dot_product_attention( deterministic=False, precision=None, ): - assert key.shape[:-1] == value.shape[:-1] assert query.shape[0:1] == key.shape[0:1] and query.shape[-1] == key.shape[-1] if axis is None: diff --git a/examples/research_projects/performer/run_mlm_performer.py b/examples/research_projects/performer/run_mlm_performer.py index 35de233f727ea4..1547ead421fd6f 100644 --- a/examples/research_projects/performer/run_mlm_performer.py +++ b/examples/research_projects/performer/run_mlm_performer.py @@ -28,18 +28,18 @@ from pathlib import Path from typing import Dict, List, Optional, Tuple -import numpy as np -from datasets import load_dataset -from tqdm import tqdm - import jax import jax.numpy as jnp +import numpy as np +from datasets import load_dataset from flax import jax_utils from flax.optim import Adam from flax.training import common_utils from flax.training.common_utils import get_metrics from jax.nn import log_softmax from modeling_flax_performer import FlaxPerformerForMaskedLM +from tqdm import tqdm + from transformers import ( MODEL_FOR_MASKED_LM_MAPPING, AutoTokenizer, @@ -632,7 +632,6 @@ def tokenize_function(examples): epochs = tqdm(range(nb_epochs), desc=f"Epoch ... 
(1/{nb_epochs})", position=0) for epoch in epochs: - # ======================== Training ================================ # Create sampling rng rng, training_rng, eval_rng = jax.random.split(rng, 3) diff --git a/examples/research_projects/pplm/run_pplm.py b/examples/research_projects/pplm/run_pplm.py index fdbad607201b17..54784b944c71cf 100644 --- a/examples/research_projects/pplm/run_pplm.py +++ b/examples/research_projects/pplm/run_pplm.py @@ -30,10 +30,10 @@ import numpy as np import torch +from pplm_classification_head import ClassificationHead from torch import nn from tqdm import trange -from pplm_classification_head import ClassificationHead from transformers import GPT2LMHeadModel, GPT2Tokenizer from transformers.file_utils import cached_path @@ -345,7 +345,7 @@ def full_text_generation( gm_scale=0.9, kl_scale=0.01, repetition_penalty=1.0, - **kwargs + **kwargs, ): classifier, class_id = get_classifier(discrim, class_label, device) @@ -463,7 +463,6 @@ def generate_text_pplm( unpert_discrim_loss = 0 loss_in_time = [] for i in trange(length, ascii=True): - # Get past/probs for current output, except for last word # Note that GPT takes 2 inputs: past + current_token @@ -547,7 +546,6 @@ def generate_text_pplm( # Fuse the modified model and original model if perturb: - unpert_probs = nn.functional.softmax(unpert_logits[:, -1, :], dim=-1) pert_probs = (pert_probs**gm_scale) * (unpert_probs ** (1 - gm_scale)) # + SMALL_CONST diff --git a/examples/research_projects/pplm/run_pplm_discrim_train.py b/examples/research_projects/pplm/run_pplm_discrim_train.py index 6a7351d9e6a63a..d53b557d1af031 100644 --- a/examples/research_projects/pplm/run_pplm_discrim_train.py +++ b/examples/research_projects/pplm/run_pplm_discrim_train.py @@ -26,12 +26,12 @@ import torch.optim as optim import torch.utils.data as data from nltk.tokenize.treebank import TreebankWordDetokenizer +from pplm_classification_head import ClassificationHead from torch import nn from torchtext import data as torchtext_data from torchtext import datasets from tqdm import tqdm, trange -from pplm_classification_head import ClassificationHead from transformers import GPT2LMHeadModel, GPT2Tokenizer diff --git a/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py b/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py index bd0b1157b01d47..814f95d0ab8f79 100755 --- a/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py +++ b/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py @@ -21,19 +21,19 @@ import datasets import numpy as np +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt import torch from absl import logging as absl_logging +from accelerate import Accelerator from datasets import load_dataset, load_metric from torch.utils.data import DataLoader +from utils_qa import postprocess_qa_predictions -import pycuda.autoinit # noqa: F401 -import pycuda.driver as cuda -import tensorrt as trt import transformers -from accelerate import Accelerator from transformers import AutoTokenizer, EvalPrediction, default_data_collator, set_seed from transformers.trainer_pt_utils import nested_concat, nested_truncate -from utils_qa import postprocess_qa_predictions TRT_LOGGER = trt.Logger(trt.Logger.WARNING) @@ -395,7 +395,6 @@ def post_processing_function(examples, features, predictions, stage="eval"): with open(engine_name, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine( f.read() ) as engine, 
engine.create_execution_context() as context: - # setup for TRT inferrence for i in range(len(input_names)): context.set_binding_shape(i, INPUT_SHAPE) @@ -427,7 +426,6 @@ def binding_nbytes(binding): all_preds = None for step, batch in enumerate(eval_dataloader): - outputs, infer_time = model_infer(batch, context, d_inputs, h_output0, h_output1, d_output0, d_output1, stream) total_time += infer_time niter += 1 diff --git a/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py b/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py index 4ed4203062c01c..bb0436c125800b 100644 --- a/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py +++ b/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py @@ -2,7 +2,6 @@ import time import numpy as np - import onnxruntime as ort diff --git a/examples/research_projects/quantization-qdqbert/quant_trainer.py b/examples/research_projects/quantization-qdqbert/quant_trainer.py index ce1ecb6c51feac..9360cc01ba7fa0 100755 --- a/examples/research_projects/quantization-qdqbert/quant_trainer.py +++ b/examples/research_projects/quantization-qdqbert/quant_trainer.py @@ -16,10 +16,9 @@ import logging import re -import torch - import pytorch_quantization import pytorch_quantization.nn as quant_nn +import torch from pytorch_quantization import calib from pytorch_quantization.tensor_quant import QuantDescriptor diff --git a/examples/research_projects/quantization-qdqbert/run_quant_qa.py b/examples/research_projects/quantization-qdqbert/run_quant_qa.py index 5008197b8b845d..ba5dfe4c090736 100755 --- a/examples/research_projects/quantization-qdqbert/run_quant_qa.py +++ b/examples/research_projects/quantization-qdqbert/run_quant_qa.py @@ -26,11 +26,12 @@ from typing import Optional import datasets +import quant_trainer from datasets import load_dataset, load_metric +from trainer_quant_qa import QuestionAnsweringTrainer +from utils_qa import postprocess_qa_predictions -import quant_trainer import transformers -from trainer_quant_qa import QuestionAnsweringTrainer from transformers import ( AutoTokenizer, DataCollatorWithPadding, @@ -46,7 +47,6 @@ from transformers.trainer_utils import SchedulerType, get_last_checkpoint from transformers.utils import check_min_version from transformers.utils.versions import require_version -from utils_qa import postprocess_qa_predictions # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
diff --git a/examples/research_projects/quantization-qdqbert/trainer_quant_qa.py b/examples/research_projects/quantization-qdqbert/trainer_quant_qa.py index ef0d93a7e357cc..9b8c53b272b11b 100644 --- a/examples/research_projects/quantization-qdqbert/trainer_quant_qa.py +++ b/examples/research_projects/quantization-qdqbert/trainer_quant_qa.py @@ -20,10 +20,10 @@ import logging import os +import quant_trainer import torch from torch.utils.data import DataLoader -import quant_trainer from transformers import Trainer, is_torch_tpu_available from transformers.trainer_utils import PredictionOutput diff --git a/examples/research_projects/rag-end2end-retriever/callbacks_rag.py b/examples/research_projects/rag-end2end-retriever/callbacks_rag.py index 5f18244a7aa481..09a30ff6d5c433 100644 --- a/examples/research_projects/rag-end2end-retriever/callbacks_rag.py +++ b/examples/research_projects/rag-end2end-retriever/callbacks_rag.py @@ -6,7 +6,6 @@ import torch from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.utilities import rank_zero_only - from utils_rag import save_json diff --git a/examples/research_projects/rag-end2end-retriever/distributed_ray_retriever.py b/examples/research_projects/rag-end2end-retriever/distributed_ray_retriever.py index 50842f062c997c..f97467292c25bf 100644 --- a/examples/research_projects/rag-end2end-retriever/distributed_ray_retriever.py +++ b/examples/research_projects/rag-end2end-retriever/distributed_ray_retriever.py @@ -2,6 +2,7 @@ import random import ray + from transformers import RagConfig, RagRetriever, RagTokenizer from transformers.models.rag.retrieval_rag import CustomHFIndex @@ -166,7 +167,6 @@ def from_pretrained(cls, retriever_name_or_path, actor_handles, indexed_dataset= ) def re_load(self): - logger.info("re-loading the new dataset with embeddings") # access from the training loop diff --git a/examples/research_projects/rag-end2end-retriever/finetune_rag.py b/examples/research_projects/rag-end2end-retriever/finetune_rag.py index 53e1e657f7f1da..8d0ba293b12b90 100644 --- a/examples/research_projects/rag-end2end-retriever/finetune_rag.py +++ b/examples/research_projects/rag-end2end-retriever/finetune_rag.py @@ -252,14 +252,12 @@ def pad(self) -> int: raise NotImplementedError("pad not implemented") def training_step(self, batch, batch_idx) -> Dict: - global isEmUpdateBusy # use to check whether the entire embedding update process is finished or not global isAddIndexBusy # use to check whether the entire indexing process is finished or not global processes # use to keep threads embedding update processes global threadHandle_index # use to keep thread in embedding indexing processes if (self.trainer.global_rank == 0) and (self.custom_config.end2end): - if (not batch_idx == 0) and (batch_idx % self.custom_config.indexing_freq == 0): free_gpu_list = [] nvmlInit() @@ -282,7 +280,6 @@ def training_step(self, batch, batch_idx) -> Dict: has_free_gpus = False if (not isEmUpdateBusy) and has_free_gpus: - model_copy = type(self.model.rag.ctx_encoder)( self.config_dpr ) # get a new instance #this will be load in the CPU @@ -336,10 +333,8 @@ def training_step(self, batch, batch_idx) -> Dict: # check when index building has started if isAddIndexBusy: - # check still the index_building process is happening if not threadHandle_index.is_alive(): - logger.info("Merging the dataset shards") saved_dataset_shards = [] @@ -494,7 +489,6 @@ def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: 
self.tokenizer.save_pretrained(save_path) if self.custom_config.end2end: - modified_state_dict = self.model.state_dict() for key in self.model.state_dict().keys(): if key.split(".")[1] == "ctx_encoder": @@ -803,7 +797,6 @@ def main(args=None, model=None) -> GenerativeQAModule: if __name__ == "__main__": - multiprocessing.set_start_method("spawn") parser = argparse.ArgumentParser() parser = pl.Trainer.add_argparse_args(parser) diff --git a/examples/research_projects/rag-end2end-retriever/kb_encode_utils.py b/examples/research_projects/rag-end2end-retriever/kb_encode_utils.py index 25fa737e5aa3c5..444c07b2bab16a 100644 --- a/examples/research_projects/rag-end2end-retriever/kb_encode_utils.py +++ b/examples/research_projects/rag-end2end-retriever/kb_encode_utils.py @@ -2,9 +2,9 @@ from functools import partial from glob import glob +import faiss from datasets import Features, Sequence, Value, concatenate_datasets, load_dataset, load_from_disk -import faiss from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast @@ -26,7 +26,6 @@ def split_documents(documents): def embed_update(ctx_encoder, total_processes, device, process_num, shard_dir, csv_path): - kb_dataset = load_dataset( "csv", data_files=[csv_path], split="train", delimiter="\t", column_names=["title", "text"] ) diff --git a/examples/research_projects/rag-end2end-retriever/lightning_base.py b/examples/research_projects/rag-end2end-retriever/lightning_base.py index 84842944059a7c..b9f8c6e3d7b5c0 100644 --- a/examples/research_projects/rag-end2end-retriever/lightning_base.py +++ b/examples/research_projects/rag-end2end-retriever/lightning_base.py @@ -69,7 +69,7 @@ def __init__( config=None, tokenizer=None, model=None, - **config_kwargs + **config_kwargs, ): """Initialize a model, tokenizer and config.""" super().__init__() @@ -365,7 +365,7 @@ def generic_train( extra_callbacks=[], checkpoint_callback=None, logging_callback=None, - **extra_train_kwargs + **extra_train_kwargs, ): pl.seed_everything(args.seed) diff --git a/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py b/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py index 432111a2784c37..e0aa86a3a65ba9 100644 --- a/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py +++ b/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py @@ -6,10 +6,10 @@ from tempfile import TemporaryDirectory from typing import List, Optional +import faiss import torch from datasets import Features, Sequence, Value, load_dataset -import faiss from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast, HfArgumentParser @@ -49,7 +49,6 @@ def main( processing_args: "ProcessingArguments", index_hnsw_args: "IndexHnswArguments", ): - ###################################### logger.info("Step 1 - Create the dataset") ###################################### diff --git a/examples/research_projects/rag/_test_finetune_rag.py b/examples/research_projects/rag/_test_finetune_rag.py index fa535f2268be34..0906295b301824 100644 --- a/examples/research_projects/rag/_test_finetune_rag.py +++ b/examples/research_projects/rag/_test_finetune_rag.py @@ -5,6 +5,7 @@ from pathlib import Path import finetune_rag + from transformers.file_utils import is_apex_available from transformers.testing_utils import ( TestCasePlus, diff --git a/examples/research_projects/rag/callbacks_rag.py b/examples/research_projects/rag/callbacks_rag.py index af1595b08efdf6..d75f97995bd16f 100644 --- 
a/examples/research_projects/rag/callbacks_rag.py +++ b/examples/research_projects/rag/callbacks_rag.py @@ -6,7 +6,6 @@ import torch from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.utilities import rank_zero_only - from utils_rag import save_json diff --git a/examples/research_projects/rag/consolidate_rag_checkpoint.py b/examples/research_projects/rag/consolidate_rag_checkpoint.py index 39ba7e91f6c3a6..6adae75fea9b12 100644 --- a/examples/research_projects/rag/consolidate_rag_checkpoint.py +++ b/examples/research_projects/rag/consolidate_rag_checkpoint.py @@ -17,7 +17,6 @@ def consolidate( generator_tokenizer_name_or_path: str = None, question_encoder_tokenizer_name_or_path: str = None, ): - if config_name_or_path is None: config_name_or_path = "facebook/rag-token-base" if model_type == "rag_token" else "facebook/rag-sequence-base" diff --git a/examples/research_projects/rag/distributed_ray_retriever.py b/examples/research_projects/rag/distributed_ray_retriever.py index 9ffc1b1e3845cd..dd5baaf726116f 100644 --- a/examples/research_projects/rag/distributed_ray_retriever.py +++ b/examples/research_projects/rag/distributed_ray_retriever.py @@ -2,6 +2,7 @@ import random import ray + from transformers import RagConfig, RagRetriever, RagTokenizer from transformers.models.rag.retrieval_rag import CustomHFIndex diff --git a/examples/research_projects/rag/lightning_base.py b/examples/research_projects/rag/lightning_base.py index 77830a4760ad39..e78a7582395875 100644 --- a/examples/research_projects/rag/lightning_base.py +++ b/examples/research_projects/rag/lightning_base.py @@ -69,7 +69,7 @@ def __init__( config=None, tokenizer=None, model=None, - **config_kwargs + **config_kwargs, ): """Initialize a model, tokenizer and config.""" super().__init__() @@ -356,7 +356,7 @@ def generic_train( extra_callbacks=[], checkpoint_callback=None, logging_callback=None, - **extra_train_kwargs + **extra_train_kwargs, ): pl.seed_everything(args.seed) diff --git a/examples/research_projects/rag/test_distributed_retriever.py b/examples/research_projects/rag/test_distributed_retriever.py index ac54d1f9857f1a..7e75e0a7a7efcc 100644 --- a/examples/research_projects/rag/test_distributed_retriever.py +++ b/examples/research_projects/rag/test_distributed_retriever.py @@ -7,10 +7,10 @@ from unittest import TestCase from unittest.mock import patch +import faiss import numpy as np from datasets import Dataset -import faiss from transformers import BartConfig, BartTokenizer, DPRConfig, DPRQuestionEncoderTokenizer, RagConfig from transformers.file_utils import is_datasets_available, is_faiss_available, is_psutil_available, is_torch_available from transformers.integrations import is_ray_available diff --git a/examples/research_projects/rag/use_own_knowledge_dataset.py b/examples/research_projects/rag/use_own_knowledge_dataset.py index dc08f508228abc..84d7c854975f11 100644 --- a/examples/research_projects/rag/use_own_knowledge_dataset.py +++ b/examples/research_projects/rag/use_own_knowledge_dataset.py @@ -6,10 +6,10 @@ from tempfile import TemporaryDirectory from typing import List, Optional +import faiss import torch from datasets import Features, Sequence, Value, load_dataset -import faiss from transformers import ( DPRContextEncoder, DPRContextEncoderTokenizerFast, @@ -56,7 +56,6 @@ def main( processing_args: "ProcessingArguments", index_hnsw_args: "IndexHnswArguments", ): - ###################################### logger.info("Step 1 - Create the dataset") 
###################################### diff --git a/examples/research_projects/robust-speech-event/eval.py b/examples/research_projects/robust-speech-event/eval.py index 32e3d1f2c729f8..a8acca1825d7da 100755 --- a/examples/research_projects/robust-speech-event/eval.py +++ b/examples/research_projects/robust-speech-event/eval.py @@ -36,7 +36,6 @@ def log_results(result: Dataset, args: Dict[str, str]): target_file = f"log_{dataset_id}_targets.txt" with open(pred_file, "w") as p, open(target_file, "w") as t: - # mapping function to write output def write_to_file(batch, i): p.write(f"{i}" + "\n") diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py index d58e17dd25c2ad..aaacc79cebd7a6 100755 --- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py +++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py @@ -25,12 +25,12 @@ from dataclasses import dataclass, field from typing import Dict, List, Optional, Union +import bitsandbytes as bnb import datasets import numpy as np import torch from datasets import DatasetDict, load_dataset, load_metric -import bitsandbytes as bnb import transformers from transformers import ( AutoConfig, @@ -717,7 +717,6 @@ def compute_metrics(pred): # Training if training_args.do_train: - # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py index ef2529fb09b238..54338f15988154 100644 --- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py +++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py @@ -622,7 +622,6 @@ def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs): # Training if training_args.do_train: - # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint diff --git a/examples/research_projects/self-training-text-classification/selftraining.py b/examples/research_projects/self-training-text-classification/selftraining.py index 7fde2fd1b8ed1e..70a6c2f319e0cb 100644 --- a/examples/research_projects/self-training-text-classification/selftraining.py +++ b/examples/research_projects/self-training-text-classification/selftraining.py @@ -23,12 +23,12 @@ from typing import List, Optional import datasets +from accelerate import Accelerator from datasets import load_dataset +from finetuning import finetune from tqdm.auto import tqdm import transformers -from accelerate import Accelerator -from finetuning import finetune from transformers import AutoConfig, set_seed from transformers.trainer_utils import IntervalStrategy diff --git a/examples/research_projects/seq2seq-distillation/_test_bash_script.py b/examples/research_projects/seq2seq-distillation/_test_bash_script.py index 53922f2b645bbc..fa84a60c0c88e0 100644 --- a/examples/research_projects/seq2seq-distillation/_test_bash_script.py +++ b/examples/research_projects/seq2seq-distillation/_test_bash_script.py @@ -8,9 +8,9 @@ import pytorch_lightning as pl import timeout_decorator import torch - from distillation import SummarizationDistiller, distill_main from finetune import SummarizationModule, main + from transformers import MarianMTModel from transformers.file_utils import cached_path 
from transformers.testing_utils import TestCasePlus, require_torch_gpu, slow diff --git a/examples/research_projects/seq2seq-distillation/_test_make_student.py b/examples/research_projects/seq2seq-distillation/_test_make_student.py index 0a1688a95cc11e..73df66315cbd79 100644 --- a/examples/research_projects/seq2seq-distillation/_test_make_student.py +++ b/examples/research_projects/seq2seq-distillation/_test_make_student.py @@ -2,6 +2,7 @@ import unittest from make_student import create_student_by_copying_alternating_layers + from transformers import AutoConfig from transformers.file_utils import cached_property from transformers.testing_utils import require_torch diff --git a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py index d97c9d43b3330d..b1c84ad9b8bdc3 100644 --- a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py +++ b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py @@ -5,18 +5,18 @@ import tempfile from pathlib import Path +import lightning_base import pytest import pytorch_lightning as pl import torch -from torch import nn - -import lightning_base from convert_pl_checkpoint_to_hf import convert_pl_to_hf from distillation import distill_main from finetune import SummarizationModule, main from huggingface_hub import list_models from parameterized import parameterized from run_eval import generate_summaries_or_translations +from torch import nn + from transformers import AutoConfig, AutoModelForSeq2SeqLM from transformers.testing_utils import CaptureStderr, CaptureStdout, TestCasePlus, require_torch_gpu, slow from utils import label_smoothed_nll_loss, lmap, load_json diff --git a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples_multi_gpu.py b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples_multi_gpu.py index af6ae24bf4c349..bb06ec8e659fa7 100644 --- a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples_multi_gpu.py +++ b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples_multi_gpu.py @@ -98,7 +98,6 @@ def setUpClass(cls): @require_torch_multi_gpu def test_multi_gpu(self): - updates = dict( no_teacher=True, freeze_encoder=True, diff --git a/examples/research_projects/seq2seq-distillation/distillation.py b/examples/research_projects/seq2seq-distillation/distillation.py index 78ff49718bb51a..323f62bf45812e 100755 --- a/examples/research_projects/seq2seq-distillation/distillation.py +++ b/examples/research_projects/seq2seq-distillation/distillation.py @@ -9,11 +9,11 @@ import pytorch_lightning as pl import torch -from torch import nn - from finetune import SummarizationModule, TranslationModule from finetune import main as ft_main from make_student import create_student_by_copying_alternating_layers, get_layers_to_supervise +from torch import nn + from transformers import AutoModelForSeq2SeqLM, MBartTokenizer, T5ForConditionalGeneration from transformers.models.bart.modeling_bart import shift_tokens_right from utils import calculate_bleu, check_output_dir, freeze_params, label_smoothed_nll_loss, use_task_specific_params diff --git a/examples/research_projects/seq2seq-distillation/finetune.py b/examples/research_projects/seq2seq-distillation/finetune.py index c20b361d583631..77f02bef135ed3 100755 --- a/examples/research_projects/seq2seq-distillation/finetune.py +++ b/examples/research_projects/seq2seq-distillation/finetune.py @@ -13,10 +13,10 @@ import 
numpy as np import pytorch_lightning as pl import torch +from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback from torch import nn from torch.utils.data import DataLoader -from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback from transformers import MBartTokenizer, T5ForConditionalGeneration from transformers.models.bart.modeling_bart import shift_tokens_right from utils import ( diff --git a/examples/research_projects/seq2seq-distillation/lightning_base.py b/examples/research_projects/seq2seq-distillation/lightning_base.py index b3104a25a8b129..f246ecab0dd01b 100644 --- a/examples/research_projects/seq2seq-distillation/lightning_base.py +++ b/examples/research_projects/seq2seq-distillation/lightning_base.py @@ -69,7 +69,7 @@ def __init__( config=None, tokenizer=None, model=None, - **config_kwargs + **config_kwargs, ): """Initialize a model, tokenizer and config.""" super().__init__() @@ -346,7 +346,7 @@ def generic_train( extra_callbacks=[], checkpoint_callback=None, logging_callback=None, - **extra_train_kwargs + **extra_train_kwargs, ): pl.seed_everything(args.seed) diff --git a/examples/research_projects/seq2seq-distillation/make_student.py b/examples/research_projects/seq2seq-distillation/make_student.py index a4021505b998e0..c1efc1b497abba 100644 --- a/examples/research_projects/seq2seq-distillation/make_student.py +++ b/examples/research_projects/seq2seq-distillation/make_student.py @@ -84,7 +84,7 @@ def create_student_by_copying_alternating_layers( copy_first_teacher_layers=False, e_layers_to_copy=None, d_layers_to_copy=None, - **extra_config_kwargs + **extra_config_kwargs, ) -> Tuple[PreTrainedModel, List[int], List[int]]: """Make a student by copying alternating layers from a teacher, save it to save_path. 
Args: @@ -107,7 +107,6 @@ def create_student_by_copying_alternating_layers( AutoTokenizer.from_pretrained(teacher).save_pretrained(save_path) # purely for convenience teacher = AutoModelForSeq2SeqLM.from_pretrained(teacher).eval() else: - assert isinstance(teacher, PreTrainedModel), f"teacher must be a model or string got type {type(teacher)}" init_kwargs = teacher.config.to_diff_dict() diff --git a/examples/research_projects/seq2seq-distillation/utils.py b/examples/research_projects/seq2seq-distillation/utils.py index a45194e6e054bf..f1a8cef8508ccd 100644 --- a/examples/research_projects/seq2seq-distillation/utils.py +++ b/examples/research_projects/seq2seq-distillation/utils.py @@ -15,10 +15,10 @@ import torch.distributed as dist from rouge_score import rouge_scorer, scoring from sacrebleu import corpus_bleu +from sentence_splitter import add_newline_to_end_of_each_sentence from torch import nn from torch.utils.data import Dataset, Sampler -from sentence_splitter import add_newline_to_end_of_each_sentence from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer from transformers.file_utils import cached_property from transformers.models.bart.modeling_bart import shift_tokens_right @@ -115,7 +115,7 @@ def __init__( type_path="train", n_obs=None, prefix="", - **dataset_kwargs + **dataset_kwargs, ): super().__init__() self.src_file = Path(data_dir).joinpath(type_path + ".source") diff --git a/examples/research_projects/tapex/run_wikisql_with_tapex.py b/examples/research_projects/tapex/run_wikisql_with_tapex.py index 1d402fa7e8f0e9..a5717d245cb6c9 100644 --- a/examples/research_projects/tapex/run_wikisql_with_tapex.py +++ b/examples/research_projects/tapex/run_wikisql_with_tapex.py @@ -32,9 +32,10 @@ import numpy as np import pandas as pd from datasets import load_dataset +from filelock import FileLock +from wikisql_utils import _TYPE_CONVERTER, retrieve_wikisql_query_answer_tapas import transformers -from filelock import FileLock from transformers import ( AutoConfig, BartForConditionalGeneration, @@ -48,7 +49,6 @@ from transformers.file_utils import is_offline_mode from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.utils import check_min_version -from wikisql_utils import _TYPE_CONVERTER, retrieve_wikisql_query_answer_tapas # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
diff --git a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py index 6f93f9b5166929..901e921f26a694 100644 --- a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py +++ b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py @@ -31,9 +31,9 @@ import numpy as np import pandas as pd from datasets import load_dataset +from filelock import FileLock import transformers -from filelock import FileLock from transformers import ( AutoConfig, BartForConditionalGeneration, diff --git a/examples/research_projects/visual_bert/extracting_data.py b/examples/research_projects/visual_bert/extracting_data.py index 9790e20ad86bf9..9c445be336f553 100644 --- a/examples/research_projects/visual_bert/extracting_data.py +++ b/examples/research_projects/visual_bert/extracting_data.py @@ -9,9 +9,9 @@ import datasets import numpy as np import torch - from modeling_frcnn import GeneralizedRCNN from processing_image import Preprocess + from utils import Config diff --git a/examples/research_projects/visual_bert/modeling_frcnn.py b/examples/research_projects/visual_bert/modeling_frcnn.py index 33c1133e9589f4..08758b1d3cac06 100644 --- a/examples/research_projects/visual_bert/modeling_frcnn.py +++ b/examples/research_projects/visual_bert/modeling_frcnn.py @@ -169,7 +169,6 @@ def get_norm(norm, out_channels): def _create_grid_offsets(size: List[int], stride: int, offset: float, device): - grid_height, grid_width = size shifts_x = torch.arange( offset * stride, @@ -390,7 +389,6 @@ def assign_boxes_to_levels( canonical_box_size: int, canonical_level: int, ): - box_sizes = torch.sqrt(torch.cat([boxes.area() for boxes in box_lists])) # Eqn.(1) in FPN paper level_assignments = torch.floor(canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8)) @@ -1708,9 +1706,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): - assert from_tf, ( - "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint" - .format(pretrained_model_name_or_path + ".index") + assert ( + from_tf + ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( + pretrained_model_name_or_path + ".index" ) archive_file = pretrained_model_name_or_path + ".index" else: diff --git a/examples/research_projects/visual_bert/utils.py b/examples/research_projects/visual_bert/utils.py index 8e830fb8359d29..2fc6ea2062efd2 100644 --- a/examples/research_projects/visual_bert/utils.py +++ b/examples/research_projects/visual_bert/utils.py @@ -34,14 +34,13 @@ from urllib.parse import urlparse from zipfile import ZipFile, is_zipfile -import numpy as np -from PIL import Image -from tqdm.auto import tqdm - import cv2 +import numpy as np import requests import wget from filelock import FileLock +from PIL import Image +from tqdm.auto import tqdm from yaml import Loader, dump, load @@ -181,7 +180,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): @classmethod def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs): - cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -225,14 +223,13 @@ 
def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs): # quick compare tensors def compare(in_tensor): - out_tensor = torch.load("dump.pt", map_location=in_tensor.device) n1 = in_tensor.numpy() n2 = out_tensor.numpy()[0] print(n1.shape, n1[0, 0, :5]) print(n2.shape, n2[0, 0, :5]) assert np.allclose(n1, n2, rtol=0.01, atol=0.1), ( - f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x == False])/len(n1.flatten())*100:.4f} %" + f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x is False])/len(n1.flatten())*100:.4f} %" " element-wise mismatch" ) raise Exception("tensors are all good") @@ -300,7 +297,6 @@ def get_from_cache( user_agent=None, local_files_only=False, ): - if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(cache_dir, Path): @@ -355,7 +351,6 @@ def get_from_cache( # Prevent parallel downloads of the same file with a lock. lock_path = cache_path + ".lock" with FileLock(lock_path): - # If the download just completed while the lock was activated. if os.path.exists(cache_path) and not force_download: # Even if returning early like here, the lock will be released. @@ -406,7 +401,6 @@ def _resumable_file_manager(): def url_to_filename(url, etag=None): - url_bytes = url.encode("utf-8") url_hash = sha256(url_bytes) filename = url_hash.hexdigest() diff --git a/examples/research_projects/visual_bert/visualizing_image.py b/examples/research_projects/visual_bert/visualizing_image.py index a02dc66dfb7c61..163d661e873ec3 100644 --- a/examples/research_projects/visual_bert/visualizing_image.py +++ b/examples/research_projects/visual_bert/visualizing_image.py @@ -18,6 +18,7 @@ import colorsys import io +import cv2 import matplotlib as mpl import matplotlib.colors as mplc import matplotlib.figure as mplfigure @@ -25,7 +26,6 @@ import torch from matplotlib.backends.backend_agg import FigureCanvasAgg -import cv2 from utils import img_tensorize diff --git a/examples/research_projects/vqgan-clip/VQGAN_CLIP.py b/examples/research_projects/vqgan-clip/VQGAN_CLIP.py index c936148c554368..b5a23c15b2b1a9 100644 --- a/examples/research_projects/vqgan-clip/VQGAN_CLIP.py +++ b/examples/research_projects/vqgan-clip/VQGAN_CLIP.py @@ -1,15 +1,15 @@ import os from glob import glob +import imageio import torch import torchvision -from PIL import Image -from torch import nn - -import imageio import wandb from img_processing import custom_to_pil, loop_post_process, preprocess, preprocess_vqgan from loaders import load_vqgan +from PIL import Image +from torch import nn + from transformers import CLIPModel, CLIPTokenizerFast from utils import get_device, get_timestamp, show_pil diff --git a/examples/research_projects/vqgan-clip/loaders.py b/examples/research_projects/vqgan-clip/loaders.py index 3fd86522dcad22..e8650f72128456 100644 --- a/examples/research_projects/vqgan-clip/loaders.py +++ b/examples/research_projects/vqgan-clip/loaders.py @@ -1,7 +1,6 @@ import importlib import torch - import yaml from omegaconf import OmegaConf from taming.models.vqgan import VQModel diff --git a/examples/research_projects/wav2vec2/alignment.py b/examples/research_projects/wav2vec2/alignment.py index 24347a55a0bce7..55b477f5ee967a 100644 --- a/examples/research_projects/wav2vec2/alignment.py +++ b/examples/research_projects/wav2vec2/alignment.py @@ -176,7 +176,6 @@ def merge_repeats(path): out_align.write(str(seg) + "\n") def align_data(self, wav_dir, text_file, output_dir): - if not os.path.exists(output_dir): os.makedirs(output_dir) diff --git 
a/examples/research_projects/wav2vec2/run_asr.py b/examples/research_projects/wav2vec2/run_asr.py index 692aa39796a769..15d2f12c7ddb56 100755 --- a/examples/research_projects/wav2vec2/run_asr.py +++ b/examples/research_projects/wav2vec2/run_asr.py @@ -7,13 +7,13 @@ from typing import Any, Callable, Dict, List, Optional, Set, Union import datasets +import librosa import numpy as np import torch +from lang_trans import arabic from packaging import version from torch import nn -import librosa -from lang_trans import arabic from transformers import ( HfArgumentParser, Trainer, diff --git a/examples/research_projects/wav2vec2/run_pretrain.py b/examples/research_projects/wav2vec2/run_pretrain.py index 8e0801429e61ec..985e6df40e31d1 100755 --- a/examples/research_projects/wav2vec2/run_pretrain.py +++ b/examples/research_projects/wav2vec2/run_pretrain.py @@ -4,12 +4,12 @@ from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Union +import librosa import torch from datasets import DatasetDict, load_dataset from packaging import version from torch import nn -import librosa from transformers import ( HfArgumentParser, Trainer, diff --git a/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py b/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py index a414f7db9770e6..8f181409d6d7e6 100644 --- a/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py +++ b/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py @@ -126,7 +126,6 @@ def run_and_check( quality_checks: bool = True, fp16: bool = True, ): - model_name = models[model] output_dir = self.run_trainer( @@ -151,7 +150,6 @@ def run_trainer( distributed: bool = True, fp16: bool = True, ): - output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False) args = f""" --model_name_or_path {model_name} diff --git a/examples/research_projects/xtreme-s/run_xtreme_s.py b/examples/research_projects/xtreme-s/run_xtreme_s.py index 16fc1ac8a39c32..38ed3376ecc0ba 100644 --- a/examples/research_projects/xtreme-s/run_xtreme_s.py +++ b/examples/research_projects/xtreme-s/run_xtreme_s.py @@ -327,7 +327,6 @@ class DataTrainingArguments: @dataclass class SpeechDataCollatorWithPadding: - processor: AutoProcessor decoder_start_token_id: Optional[int] = None padding: Union[bool, str] = "longest" @@ -863,7 +862,6 @@ def compute_classification_metric(pred): # Training if training_args.do_train: - # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index f8d6cf0d1984ab..d9fcc8daafa62d 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -26,12 +26,12 @@ from dataclasses import dataclass, field from typing import Optional +import evaluate import numpy as np import tensorflow as tf from datasets import load_dataset from PIL import Image -import evaluate import transformers from transformers import ( MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index fecc55853982b0..f5bc179a96ac11 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -117,6 +117,7 @@ def __call__(self, features): # endregion + # region Arguments @dataclass class ModelArguments: 
diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index e5d6018e5c7187..1c3acd34aedd9d 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -26,10 +26,11 @@ from pathlib import Path from typing import Optional +import evaluate import tensorflow as tf from datasets import load_dataset +from utils_qa import postprocess_qa_predictions -import evaluate import transformers from transformers import ( AutoConfig, @@ -44,7 +45,6 @@ set_seed, ) from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry -from utils_qa import postprocess_qa_predictions # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -214,6 +214,7 @@ def __post_init__(self): # endregion + # region Helper classes class SavePretrainedCallback(tf.keras.callbacks.Callback): # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary @@ -610,7 +611,6 @@ def compute_metrics(p: EvalPrediction): # endregion with training_args.strategy.scope(): - dataset_options = tf.data.Options() dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF num_replicas = training_args.strategy.num_replicas_in_sync @@ -628,7 +628,6 @@ def compute_metrics(p: EvalPrediction): use_auth_token=True if model_args.use_auth_token else None, ) if training_args.do_train: - training_dataset = model.prepare_tf_dataset( processed_datasets["train"], shuffle=True, diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index b8598e2fdcd9b5..61ee9c2ba6d37f 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -26,14 +26,14 @@ from typing import Optional import datasets +import evaluate import nltk # Here to have a nice missing dependency error message early on import numpy as np import tensorflow as tf from datasets import load_dataset +from filelock import FileLock -import evaluate import transformers -from filelock import FileLock from transformers import ( AutoConfig, AutoTokenizer, diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index dceb0783f873f1..bf03901011fa4b 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -23,11 +23,11 @@ from dataclasses import dataclass, field from typing import Optional +import evaluate import numpy as np import tensorflow as tf from datasets import load_dataset -import evaluate import transformers from transformers import ( AutoConfig, diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py index 86fe819e28912a..7b90938f02d7ae 100644 --- a/examples/tensorflow/token-classification/run_ner.py +++ b/examples/tensorflow/token-classification/run_ner.py @@ -26,10 +26,10 @@ from typing import Optional import datasets +import evaluate import tensorflow as tf from datasets import ClassLabel, load_dataset -import evaluate import transformers from transformers import ( CONFIG_MAPPING, diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 206386bc684131..09c0b8a9ea7ed0 100644 --- a/examples/tensorflow/translation/run_translation.py +++ 
b/examples/tensorflow/translation/run_translation.py @@ -26,11 +26,11 @@ from typing import Optional import datasets +import evaluate import numpy as np import tensorflow as tf from datasets import load_dataset -import evaluate import transformers from transformers import ( AutoConfig, diff --git a/pyproject.toml b/pyproject.toml index 291558c9a3deaa..e2d594210f79ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,13 @@ [tool.black] line-length = 119 -target-version = ['py35'] +target-version = ['py37'] + +[tool.ruff] +# Never enforce `E501` (line length violations). +ignore = ["E501", "E741", "W605"] +select = ["E", "F", "I", "W"] +line-length = 119 + +[tool.ruff.isort] +lines-after-imports = 2 +known-first-party = ["transformers"] diff --git a/setup.cfg b/setup.cfg index 2d605ccceca788..9a56ddc2fc65ac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,43 +4,6 @@ ensure_newline_before_comments = True force_grid_wrap = 0 include_trailing_comma = True known_first_party = transformers -known_third_party = - absl - conllu - datasets - elasticsearch - fairseq - faiss-cpu - fastprogress - fire - fugashi - git - h5py - matplotlib - nltk - numpy - packaging - pandas - PIL - psutil - pytest - pytorch_lightning - rouge_score - sacrebleu - seqeval - sklearn - streamlit - tensorboardX - tensorflow - tensorflow_datasets - timeout_decorator - torch - torchaudio - torchtext - torchvision - torch_xla - tqdm - line_length = 119 lines_after_imports = 2 multi_line_output = 3 diff --git a/setup.py b/setup.py index c5085c14874ef3..aad145b145f275 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ _deps = [ "Pillow", "accelerate>=0.10.0", - "black==22.3", # after updating to black 2023, also update Python version in pyproject.toml to 3.7 + "black~=23.1", "codecarbon==1.2.0", "cookiecutter==1.7.3", "dataclasses", @@ -111,7 +111,6 @@ "faiss-cpu", "fastapi", "filelock", - "flake8>=3.8.3", "flax>=0.4.1", "ftfy", "fugashi>=1.0", @@ -150,6 +149,7 @@ "requests", "rjieba", "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", + "ruff>=0.0.241", "sacrebleu>=1.4.12,<2.0.0", "sacremoses", "safetensors>=0.2.1", @@ -321,7 +321,7 @@ def run(self): extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"] -extras["quality"] = deps_list("black", "datasets", "isort", "flake8", "GitPython", "hf-doc-builder") +extras["quality"] = deps_list("black", "datasets", "isort", "ruff", "GitPython", "hf-doc-builder") extras["all"] = ( extras["tf"] diff --git a/src/transformers/benchmark/benchmark.py b/src/transformers/benchmark/benchmark.py index 7f95e4b40b7cd3..3c5c877a454e63 100644 --- a/src/transformers/benchmark/benchmark.py +++ b/src/transformers/benchmark/benchmark.py @@ -48,7 +48,6 @@ class PyTorchBenchmark(Benchmark): - args: PyTorchBenchmarkArguments configs: PretrainedConfig framework: str = "PyTorch" diff --git a/src/transformers/benchmark/benchmark_args.py b/src/transformers/benchmark/benchmark_args.py index 26c0eb95a4bcca..b5887e4a9bcb4b 100644 --- a/src/transformers/benchmark/benchmark_args.py +++ b/src/transformers/benchmark/benchmark_args.py @@ -33,7 +33,6 @@ @dataclass class PyTorchBenchmarkArguments(BenchmarkArguments): - deprecated_args = [ "no_inference", "no_cuda", diff --git a/src/transformers/benchmark/benchmark_args_tf.py b/src/transformers/benchmark/benchmark_args_tf.py index 12cb6f5cbbeb86..c1c2ec16ce550c 100644 --- a/src/transformers/benchmark/benchmark_args_tf.py +++ b/src/transformers/benchmark/benchmark_args_tf.py @@ -30,7 +30,6 @@ @dataclass class 
TensorFlowBenchmarkArguments(BenchmarkArguments): - deprecated_args = [ "no_inference", "no_cuda", diff --git a/src/transformers/benchmark/benchmark_tf.py b/src/transformers/benchmark/benchmark_tf.py index b5fd4b71b562a2..126172ffbd3000 100644 --- a/src/transformers/benchmark/benchmark_tf.py +++ b/src/transformers/benchmark/benchmark_tf.py @@ -77,7 +77,6 @@ def random_input_ids(batch_size: int, sequence_length: int, vocab_size: int) -> class TensorFlowBenchmark(Benchmark): - args: TensorFlowBenchmarkArguments configs: PretrainedConfig framework: str = "TensorFlow" diff --git a/src/transformers/benchmark/benchmark_utils.py b/src/transformers/benchmark/benchmark_utils.py index 79740805807185..a6c6353c19fb37 100644 --- a/src/transformers/benchmark/benchmark_utils.py +++ b/src/transformers/benchmark/benchmark_utils.py @@ -890,7 +890,6 @@ def save_to_csv(self, result_dict, filename): return self.print_fn("Saving results to csv.") with open(filename, mode="w") as csv_file: - assert len(self.args.model_names) > 0, f"At least 1 model should be defined, but got {self.model_names}" fieldnames = ["model", "batch_size", "sequence_length"] diff --git a/src/transformers/commands/convert.py b/src/transformers/commands/convert.py index 8c3e37bfcf3d1c..a8cf431cb0876b 100644 --- a/src/transformers/commands/convert.py +++ b/src/transformers/commands/convert.py @@ -71,7 +71,7 @@ def __init__( pytorch_dump_output: str, config: str, finetuning_task_name: str, - *args + *args, ): self._logger = logging.get_logger("transformers-cli/converting") diff --git a/src/transformers/commands/pt_to_tf.py b/src/transformers/commands/pt_to_tf.py index b65249f290856a..669f7a98003bac 100644 --- a/src/transformers/commands/pt_to_tf.py +++ b/src/transformers/commands/pt_to_tf.py @@ -17,11 +17,10 @@ from argparse import ArgumentParser, Namespace from importlib import import_module +import huggingface_hub import numpy as np from packaging import version -import huggingface_hub - from .. import ( FEATURE_EXTRACTOR_MAPPING, IMAGE_PROCESSOR_MAPPING, @@ -145,7 +144,6 @@ def find_pt_tf_differences(pt_outputs, tf_outputs): # 2. For each output attribute, computes the difference def _find_pt_tf_differences(pt_out, tf_out, differences, attr_name=""): - # If the current attribute is a tensor, it is a leaf and we make the comparison. Otherwise, we will dig in # recursivelly, keeping the name of the attribute. 
if isinstance(pt_out, torch.Tensor): @@ -177,7 +175,7 @@ def __init__( no_pr: bool, push: bool, extra_commit_description: str, - *args + *args, ): self._logger = logging.get_logger("transformers-cli/pt_to_tf") self._model_name = model_name diff --git a/src/transformers/commands/serving.py b/src/transformers/commands/serving.py index 4deae833f712e1..803ae71d1c122e 100644 --- a/src/transformers/commands/serving.py +++ b/src/transformers/commands/serving.py @@ -122,7 +122,6 @@ def register_subcommand(parser: ArgumentParser): serve_parser.set_defaults(func=serve_command_factory) def __init__(self, pipeline: Pipeline, host: str, port: int, workers: int): - self._pipeline = pipeline self.host = host diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py index ce4350b3b1f284..5449d98237ea64 100644 --- a/src/transformers/convert_graph_to_onnx.py +++ b/src/transformers/convert_graph_to_onnx.py @@ -328,7 +328,6 @@ def convert_tensorflow(nlp: Pipeline, opset: int, output: Path): try: import tensorflow as tf - import tf2onnx from tf2onnx import __version__ as t2ov @@ -358,7 +357,7 @@ def convert( tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", - **model_kwargs + **model_kwargs, ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py index 62a071dd3cc135..f1358408a5cb57 100755 --- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -358,7 +358,6 @@ def convert_all_pt_checkpoints_to_tf( remove_cached_files=False, only_convert_finetuned_models=False, ): - if args_model_type is None: model_types = list(MODEL_CLASSES.keys()) else: diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index befa22c2e1787b..72df3bece21925 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -20,9 +20,8 @@ from typing import List, Optional, Union import torch -from torch.utils.data import Dataset - from filelock import FileLock +from torch.utils.data import Dataset from ...tokenization_utils_base import PreTrainedTokenizerBase from ...utils import logging @@ -121,7 +120,6 @@ def __init__( # and the others will use the cache. lock_path = cached_features_file + ".lock" with FileLock(lock_path): - if os.path.exists(cached_features_file) and not args.overwrite_cache: start = time.time() self.features = torch.load(cached_features_file) diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 8c6c98b264f363..6c23bf23cf14d4 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -21,9 +21,8 @@ from typing import Dict, List, Optional import torch -from torch.utils.data import Dataset - from filelock import FileLock +from torch.utils.data import Dataset from ...tokenization_utils import PreTrainedTokenizer from ...utils import logging @@ -72,7 +71,6 @@ def __init__( # and the others will use the cache. 
lock_path = cached_features_file + ".lock" with FileLock(lock_path): - if os.path.exists(cached_features_file) and not overwrite_cache: start = time.time() with open(cached_features_file, "rb") as handle: diff --git a/src/transformers/data/datasets/squad.py b/src/transformers/data/datasets/squad.py index e1c8c9cb6c0500..d81217d818afff 100644 --- a/src/transformers/data/datasets/squad.py +++ b/src/transformers/data/datasets/squad.py @@ -19,9 +19,8 @@ from typing import Dict, List, Optional, Union import torch -from torch.utils.data import Dataset - from filelock import FileLock +from torch.utils.data import Dataset from ...models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING from ...tokenization_utils import PreTrainedTokenizer diff --git a/src/transformers/data/metrics/__init__.py b/src/transformers/data/metrics/__init__.py index aca2b7ffd43cb2..c9c998867ae0e1 100644 --- a/src/transformers/data/metrics/__init__.py +++ b/src/transformers/data/metrics/__init__.py @@ -20,9 +20,8 @@ if is_sklearn_available(): - from sklearn.metrics import f1_score, matthews_corrcoef - from scipy.stats import pearsonr, spearmanr + from sklearn.metrics import f1_score, matthews_corrcoef DEPRECATION_WARNING = ( diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 64137c95aca266..0f8bd248055115 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -166,7 +166,6 @@ def squad_convert_example_to_features( span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): - # Define the side we want to truncate / pad and the text/pair sorting if tokenizer.padding_side == "right": texts = truncated_query diff --git a/src/transformers/data/test_generation_utils.py b/src/transformers/data/test_generation_utils.py index b08dd88026ba45..a69b5683de755d 100644 --- a/src/transformers/data/test_generation_utils.py +++ b/src/transformers/data/test_generation_utils.py @@ -73,7 +73,6 @@ def test_postprocess_next_token_scores(self): @timeout_decorator.timeout(10) def test_postprocess_next_token_scores_large_bad_words_list(self): - config = self.config model = self.model # Initialize an input id tensor with batch size 8 and sequence length 12 diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 9465307f5f57ea..037c5985f88b94 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -344,7 +344,6 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inf deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs) if resume_from_checkpoint is not None: - # it's possible that the user is trying to resume from model_path, which doesn't necessarily # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's # a resume from a checkpoint and not just a local pretrained weight. 
So we check here if the diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 1a256380f0f6d7..d60da273bf001d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -4,7 +4,7 @@ deps = { "Pillow": "Pillow", "accelerate": "accelerate>=0.10.0", - "black": "black==22.3", + "black": "black~=23.1", "codecarbon": "codecarbon==1.2.0", "cookiecutter": "cookiecutter==1.7.3", "dataclasses": "dataclasses", @@ -17,7 +17,6 @@ "faiss-cpu": "faiss-cpu", "fastapi": "fastapi", "filelock": "filelock", - "flake8": "flake8>=3.8.3", "flax": "flax>=0.4.1", "ftfy": "ftfy", "fugashi": "fugashi>=1.0", @@ -56,6 +55,7 @@ "requests": "requests", "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", + "ruff": "ruff>=0.0.241", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", "safetensors": "safetensors>=0.2.1", diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index 8b720e6a77358c..d29675f75754d0 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -98,7 +98,7 @@ def process( next_scores: torch.FloatTensor, next_tokens: torch.LongTensor, next_indices: torch.LongTensor, - **kwargs + **kwargs, ) -> Tuple[torch.Tensor]: raise NotImplementedError("This is an abstract method.") @@ -111,7 +111,7 @@ def finalize( next_tokens: torch.LongTensor, next_indices: torch.LongTensor, max_length: int, - **kwargs + **kwargs, ) -> torch.LongTensor: raise NotImplementedError("This is an abstract method.") @@ -574,7 +574,6 @@ def process( batch_beam_idx = batch_idx * self.group_size + next_index # add to generated hypotheses if end of sentence if (eos_token_id is not None) and (next_token.item() in eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size if is_beam_token_worse_than_top_num_beams: diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index e01b0dc40aa70a..f2b6a76e11e774 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -322,7 +322,7 @@ def save_pretrained( save_directory: Union[str, os.PathLike], config_file_name: Optional[Union[str, os.PathLike]] = None, push_to_hub: bool = False, - **kwargs + **kwargs, ): r""" Save a generation configuration object to the directory `save_directory`, so that it can be re-loaded using the @@ -372,7 +372,7 @@ def from_pretrained( cls, pretrained_model_name: Union[str, os.PathLike], config_file_name: Optional[Union[str, os.PathLike]] = None, - **kwargs + **kwargs, ) -> "GenerationConfig": r""" Instantiate a [`GenerationConfig`] from a generation configuration file. 
diff --git a/src/transformers/generation/flax_logits_process.py b/src/transformers/generation/flax_logits_process.py index 92e3cff82cb5c2..ccbe7408ff095a 100644 --- a/src/transformers/generation/flax_logits_process.py +++ b/src/transformers/generation/flax_logits_process.py @@ -258,7 +258,6 @@ def __init__(self, min_length: int, eos_token_id: int): self.eos_token_id = eos_token_id def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray: - # create boolean flag to decide if min length penalty should be applied apply_penalty = 1 - jnp.clip(cur_len - self.min_length, 0, 1) diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index 1cfc07b9786c8b..b615b3fa3592ee 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -21,11 +21,10 @@ from functools import partial from typing import Any, Dict, Optional, Union -import numpy as np - import flax import jax import jax.numpy as jnp +import numpy as np from jax import lax from ..models.auto import ( diff --git a/src/transformers/generation/tf_logits_process.py b/src/transformers/generation/tf_logits_process.py index 19a14c63b2c877..369c969526a4f7 100644 --- a/src/transformers/generation/tf_logits_process.py +++ b/src/transformers/generation/tf_logits_process.py @@ -296,7 +296,6 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor): """ def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): - if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0: raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.") if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids): @@ -421,7 +420,6 @@ def _get_generated_ngrams(hypo_idx): return banned_tokens def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: - # TODO (joao): enable XLA on this logits processor. 
See discussion and attempts in # https://github.com/huggingface/transformers/pull/16974 if not tf.executing_eagerly(): diff --git a/src/transformers/generation/tf_utils.py b/src/transformers/generation/tf_utils.py index ec246aaff0e243..b61398163f5435 100644 --- a/src/transformers/generation/tf_utils.py +++ b/src/transformers/generation/tf_utils.py @@ -951,7 +951,6 @@ def _prepare_decoder_input_ids_for_generation( bos_token_id: int = None, model_kwargs: Optional[Dict[str, tf.Tensor]] = None, ) -> tf.Tensor: - # prepare `input_ids` for decoder if model is encoder-decoder if model_kwargs is not None and "decoder_input_ids" in model_kwargs: return model_kwargs.pop("decoder_input_ids") @@ -2459,7 +2458,6 @@ def contrastive_search_body_fn( # if the first step in the loop, encode all the prefix and obtain: (1) past_key_values; # (2) last_hidden_states; (3) logit_for_next_step; (4) update model kwargs for the next step if model_kwargs.get("past_key_values") is None: - # prepare inputs model_inputs = self.prepare_inputs_for_generation( generated[:, :cur_len], use_cache=use_cache, **model_kwargs @@ -2676,7 +2674,13 @@ def contrastive_search_body_fn( generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables ): maximum_iterations = max_length - cur_len - generated, _, cur_len, _, _, = tf.while_loop( + ( + generated, + _, + cur_len, + _, + _, + ) = tf.while_loop( contrastive_search_cond_fn, contrastive_search_body_fn, (generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables), diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index f81d76a966c913..9fd563fe388a57 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -17,9 +17,8 @@ from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple, Union import numpy as np -from packaging import version - import requests +from packaging import version from .utils import ( ExplicitEnum, diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index f1fa04d1936a76..b1ebdba9a07f27 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -354,6 +354,7 @@ def dynamic_modules_import_trainable(*args, **kwargs): def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> BestRun: import sigopt + from transformers.utils.versions import importlib_metadata if trainer.args.process_index == 0: @@ -723,7 +724,6 @@ def setup(self, args, state, model, **kwargs): init_args["name"] = args.run_name if self._wandb.run is None: - self._wandb.init( project=os.getenv("WANDB_PROJECT", "huggingface"), **init_args, @@ -1158,7 +1158,7 @@ def __init__( run: Optional["Run"] = None, log_parameters: bool = True, log_checkpoints: Optional[str] = None, - **neptune_run_kwargs + **neptune_run_kwargs, ): if not is_neptune_available(): raise ValueError( diff --git a/src/transformers/keras_callbacks.py b/src/transformers/keras_callbacks.py index f99bd738ea7c58..4fd2da18a6b5e3 100644 --- a/src/transformers/keras_callbacks.py +++ b/src/transformers/keras_callbacks.py @@ -6,11 +6,10 @@ import numpy as np import tensorflow as tf +from huggingface_hub import Repository, create_repo from packaging.version import parse from tensorflow.keras.callbacks import Callback -from huggingface_hub import Repository, create_repo - from . 
import IntervalStrategy, PreTrainedTokenizerBase from .modelcard import TrainingSummary from .utils import get_full_repo_name @@ -320,7 +319,7 @@ def __init__( hub_model_id: Optional[str] = None, hub_token: Optional[str] = None, checkpoint: bool = False, - **model_card_args + **model_card_args, ): super().__init__() if checkpoint and save_strategy != "epoch": diff --git a/src/transformers/modeling_flax_pytorch_utils.py b/src/transformers/modeling_flax_pytorch_utils.py index 47da8c2871b321..e013e74eef4d09 100644 --- a/src/transformers/modeling_flax_pytorch_utils.py +++ b/src/transformers/modeling_flax_pytorch_utils.py @@ -19,14 +19,14 @@ from pickle import UnpicklingError from typing import Dict, Tuple -import numpy as np - import jax import jax.numpy as jnp -import transformers +import numpy as np from flax.serialization import from_bytes from flax.traverse_util import flatten_dict, unflatten_dict +import transformers + from .utils import logging @@ -130,7 +130,6 @@ def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model): # Need to change some parameters name to match Flax names for pt_key, pt_tensor in pt_state_dict.items(): - pt_tuple_key = tuple(pt_key.split(".")) # remove base model prefix if necessary @@ -187,7 +186,6 @@ def convert_pytorch_sharded_state_dict_to_flax(shard_filenames, flax_model): ) # Need to change some parameters name to match Flax names for pt_key, pt_tensor in pt_state_dict.items(): - pt_tuple_key = tuple(pt_key.split(".")) # remove base model prefix if necessary diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index c501764350c296..a635c7b62b322e 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -484,9 +484,8 @@ def from_pretrained( pretrained_model_name_or_path: Union[str, os.PathLike], dtype: jnp.dtype = jnp.float32, *model_args, - **kwargs + **kwargs, ): - r""" Instantiate a pretrained flax model from a pre-trained model configuration. @@ -810,7 +809,6 @@ def from_pretrained( if from_pt: state = load_pytorch_checkpoint_in_flax_state_dict(model, resolved_archive_file, is_sharded) else: - if is_sharded: state = cls.load_flax_sharded_weights(resolved_archive_file) else: diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 42a6adea12d1fc..1a313ec959cb8d 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -30,9 +30,9 @@ import h5py import numpy as np import tensorflow as tf +from huggingface_hub import Repository, list_repo_files from packaging.version import parse -from huggingface_hub import Repository, list_repo_files from transformers.utils.hub import convert_file_size_to_int, get_checkpoint_shard_files from . 
import DataCollatorWithPadding, DefaultDataCollator @@ -1402,7 +1402,7 @@ def compile( weighted_metrics=None, run_eagerly=None, steps_per_execution=None, - **kwargs + **kwargs, ): """ This is a thin wrapper that sets the model's loss output head as the loss if the user does not specify a loss @@ -1501,7 +1501,6 @@ def train_step(self, data): # When using a dummy loss, we ensure that separate labels are copied to the correct model arguments, # if those keys are not already present in the input dict if self._using_dummy_loss and y is not None: - # If y is a tensor and the model only has one label-like input, map y to that input if len(label_kwargs) == 1 and isinstance(y, tf.Tensor): if isinstance(x, tf.Tensor): @@ -2228,7 +2227,7 @@ def save_pretrained( max_shard_size: Union[int, str] = "10GB", create_pr: bool = False, safe_serialization: bool = False, - **kwargs + **kwargs, ): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the @@ -2893,7 +2892,7 @@ def push_to_hub( private: Optional[bool] = None, use_auth_token: Optional[Union[bool, str]] = None, max_shard_size: Optional[Union[int, str]] = "10GB", - **model_card_kwargs + **model_card_kwargs, ) -> str: """ Upload the model files to the 🤗 Model Hub while synchronizing a local clone of the repo in `repo_path_or_name`. diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 484467c0c13536..e03f87f92f9c4c 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -962,7 +962,6 @@ def floating_point_ops( class BackboneMixin: def forward_with_filtered_kwargs(self, *args, **kwargs): - signature = dict(inspect.signature(self.forward).parameters) filtered_kwargs = {k: v for k, v in kwargs.items() if k in signature} @@ -2520,7 +2519,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ) raise elif from_pt: - # restore default dtype if dtype_orig is not None: torch.set_default_dtype(dtype_orig) diff --git a/src/transformers/models/albert/configuration_albert.py b/src/transformers/models/albert/configuration_albert.py index d11d49a2c78fcc..fd0c6238879257 100644 --- a/src/transformers/models/albert/configuration_albert.py +++ b/src/transformers/models/albert/configuration_albert.py @@ -132,7 +132,7 @@ def __init__( pad_token_id=0, bos_token_id=2, eos_token_id=3, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index dc9559ac3624ef..687a927ef0c486 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -631,7 +631,6 @@ class AlbertForPreTrainingOutput(ModelOutput): ALBERT_START_DOCSTRING, ) class AlbertModel(AlbertPreTrainedModel): - config_class = AlbertConfig base_model_prefix = "albert" @@ -912,7 +911,6 @@ def forward(self, pooled_output: torch.Tensor) -> torch.Tensor: ALBERT_START_DOCSTRING, ) class AlbertForMaskedLM(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [ "predictions.decoder.weight", @@ -1133,7 +1131,6 @@ def forward( ALBERT_START_DOCSTRING, ) class AlbertForTokenClassification(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config: AlbertConfig): @@ -1219,7 +1216,6 @@ def forward( ALBERT_START_DOCSTRING, ) class 
AlbertForQuestionAnswering(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config: AlbertConfig): diff --git a/src/transformers/models/albert/modeling_flax_albert.py b/src/transformers/models/albert/modeling_flax_albert.py index e55038c8acff72..0ff1b9276a19d6 100644 --- a/src/transformers/models/albert/modeling_flax_albert.py +++ b/src/transformers/models/albert/modeling_flax_albert.py @@ -15,12 +15,11 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen.attention import dot_product_attention_weights from flax.traverse_util import flatten_dict, unflatten_dict @@ -523,7 +522,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -702,7 +701,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - # Model outputs = self.albert( input_ids, diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index 823c7c48bbb572..247ee395dc60fe 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -581,7 +581,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index 5bebb936cf7d11..b043a14989fc57 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -147,7 +147,7 @@ def __init__( cls_token="[CLS]", mask_token="[MASK]", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it and # is included in the raw text, there should be a match in a non-normalized sentence. diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py index c2ffcd90b192e9..16c54e7eac6c94 100644 --- a/src/transformers/models/albert/tokenization_albert_fast.py +++ b/src/transformers/models/albert/tokenization_albert_fast.py @@ -135,7 +135,7 @@ def __init__( pad_token="", cls_token="[CLS]", mask_token="[MASK]", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it and # is included in the raw text, there should be a match in a non-normalized sentence. 
diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 25d7fa69c18ce0..523cf420e0aed5 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -117,7 +117,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, project_dim=768, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -208,7 +208,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -228,7 +228,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from AltCLIPConfig diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index d21e329460cd3d..7d150d5734e1e8 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -637,7 +637,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index 19f85189ad0dbd..22b0ca70ac8520 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -105,7 +105,7 @@ def __init__( time_stride=10, max_length=1024, num_mel_bins=128, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index f339bbc6c2bf53..32e0f33d04fdb2 100644 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -22,8 +22,8 @@ import torch import torchaudio from datasets import load_dataset - from huggingface_hub import hf_hub_download + from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification from transformers.utils import logging diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index f61f62b4783e96..deda2fc7781b28 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -73,7 +73,7 @@ def __init__( mean=-4.2677393, std=4.5689974, return_attention_mask=False, 
-        **kwargs
+        **kwargs,
     ):
         super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
         self.num_mel_bins = num_mel_bins
@@ -127,7 +127,7 @@ def __call__(
         raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
         sampling_rate: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
-        **kwargs
+        **kwargs,
     ) -> BatchFeature:
         """
         Main method to featurize and prepare for the model one or several sequence(s).
diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
index de67d477445069..54b77df7458d2c 100644
--- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
@@ -188,7 +188,6 @@ def __init__(self, config: ASTConfig) -> None:
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
-
         hidden_states = self.dense(hidden_states)
         hidden_states = self.dropout(hidden_states)
 
@@ -246,7 +245,6 @@ def __init__(self, config: ASTConfig) -> None:
         self.intermediate_act_fn = config.hidden_act
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-
         hidden_states = self.dense(hidden_states)
         hidden_states = self.intermediate_act_fn(hidden_states)
 
diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py
index 2558311a44bd78..2a04657f419909 100644
--- a/src/transformers/models/bart/configuration_bart.py
+++ b/src/transformers/models/bart/configuration_bart.py
@@ -139,7 +139,7 @@ def __init__(
         is_encoder_decoder=True,
         decoder_start_token_id=2,
         forced_eos_token_id=2,
-        **kwargs
+        **kwargs,
     ):
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index 136a5eebc40090..2aac055f85f0bf 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -1084,7 +1083,6 @@ def forward(
             past_key_value = past_key_values[idx] if past_key_values is not None else None
 
             if self.gradient_checkpointing and self.training:
-
                 if use_cache:
                     logger.warning(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
@@ -1109,7 +1108,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1212,7 +1210,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqModelOutput]: - # different to other models, Bart automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -1424,7 +1421,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past_key_values is used if past_key_values is not None: diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py index 01b2bf8ecec144..ac292cc77707db 100644 --- a/src/transformers/models/bart/modeling_flax_bart.py +++ b/src/transformers/models/bart/modeling_flax_bart.py @@ -19,11 +19,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -916,7 +915,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1472,7 +1471,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape @@ -1748,7 +1747,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): config.is_decoder = True config.is_encoder_decoder = False @@ -1920,7 +1919,6 @@ def __call__( return_dict: bool = True, deterministic: bool = True, ): - outputs = self.model( input_ids, attention_mask, diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 44b230be0eb9b0..6e29434c4df158 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -803,7 +803,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1109,9 +1108,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]: - # different to other models, Bart automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -1185,7 +1183,6 @@ def call( BART_START_DOCSTRING, ) class TFBartModel(TFBartPretrainedModel): - _requires_load_weight_prefix = True def __init__(self, config: BartConfig, load_weight_prefix=None, *inputs, **kwargs): @@ -1225,9 +1222,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = 
None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: - outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -1442,9 +1438,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index a6c5fdc6811549..5dc93578109f8f 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -192,7 +192,7 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index 26c546be001a24..6d6e29986be4fa 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ -166,7 +166,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/barthez/__init__.py b/src/transformers/models/barthez/__init__.py index df1a006d3154e8..da0f2a93bdce86 100644 --- a/src/transformers/models/barthez/__init__.py +++ b/src/transformers/models/barthez/__init__.py @@ -41,7 +41,6 @@ if TYPE_CHECKING: - try: if not is_sentencepiece_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 2e58db113e151e..77ab8a9d64166b 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -134,7 +134,7 @@ def __init__( pad_token="", mask_token="", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py index a7f36e007c146f..f53a5acd712c71 100644 --- a/src/transformers/models/barthez/tokenization_barthez_fast.py +++ b/src/transformers/models/barthez/tokenization_barthez_fast.py @@ -127,7 +127,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py index b12a962eae3e23..1c1ef0b8675b8a 100644 --- a/src/transformers/models/bartpho/tokenization_bartpho.py +++ b/src/transformers/models/bartpho/tokenization_bartpho.py @@ -132,7 +132,7 @@ def __init__( pad_token="", mask_token="", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py index c44c59942f03bc..ef7bf22b9189cf 100644 --- a/src/transformers/models/beit/configuration_beit.py +++ b/src/transformers/models/beit/configuration_beit.py @@ -147,7 +147,7 @@ def __init__( auxiliary_num_convs=1, auxiliary_concat_input=False, semantic_loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -186,7 +186,6 @@ def __init__( # Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig class BeitOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py index 12da57ea386c5e..f80d52db7d8893 100644 --- a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py +++ b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py @@ -19,12 +19,12 @@ import json from pathlib import Path +import requests import torch from datasets import load_dataset +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( BeitConfig, BeitFeatureExtractor, diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 4fa8b0aab0e890..6f73679a71baf6 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -105,7 +105,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_reduce_labels: bool = False, - **kwargs + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -157,7 +157,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). @@ -184,7 +184,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to (size["height"], size["width"]). If the input size is smaller than `size` along any @@ -206,7 +206,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. 
@@ -227,7 +227,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index 803f602f92a2f7..4caf0f478fb6db 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -149,7 +149,6 @@ def __init__(self, config: BeitConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor: - embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.size() diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py index 8e9f87b29448ed..02fb2e5e338dfa 100644 --- a/src/transformers/models/beit/modeling_flax_beit.py +++ b/src/transformers/models/beit/modeling_flax_beit.py @@ -16,12 +16,11 @@ from typing import Callable, List, Optional, Tuple -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen.attention import dot_product_attention_weights from flax.traverse_util import flatten_dict, unflatten_dict @@ -166,7 +165,6 @@ def __call__(self, inputs, deterministic: Optional[bool] = True): class FlaxBeitPatchEmbeddings(nn.Module): - config: BeitConfig dtype: jnp.dtype = jnp.float32 # the dtype of the computation @@ -217,7 +215,6 @@ def setup(self): self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) def __call__(self, pixel_values, bool_masked_pos=None, deterministic=True): - embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.shape @@ -518,7 +515,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - all_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None @@ -605,7 +601,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) if input_shape is None: @@ -713,7 +709,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - hidden_states = self.embeddings(pixel_values, bool_masked_pos, deterministic=deterministic) outputs = self.encoder( diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py index b2d64b7fde67da..589c2b0261854b 100644 --- a/src/transformers/models/bert/configuration_bert.py +++ b/src/transformers/models/bert/configuration_bert.py @@ -156,7 +156,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py index a58240c8c3c2f7..68ed9bafc873ac 100644 --- a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py +++ b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py @@ -26,7 +26,6 @@ def convert_pytorch_checkpoint_to_tf(model: BertModel, 
ckpt_dir: str, model_name: str): - """ Args: model: BertModel Pytorch model instance to be converted diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 61355216653cb0..eb0e0d21665034 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -584,7 +584,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1158,7 +1157,6 @@ def forward( """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING ) class BertLMHeadModel(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"] @@ -1299,7 +1297,6 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) class BertForMaskedLM(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"] @@ -1713,7 +1710,6 @@ def forward( BERT_START_DOCSTRING, ) class BertForTokenClassification(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1799,7 +1795,6 @@ def forward( BERT_START_DOCSTRING, ) class BertForQuestionAnswering(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py index 818a3ee0896452..6e8eb829b90903 100644 --- a/src/transformers/models/bert/modeling_flax_bert.py +++ b/src/transformers/models/bert/modeling_flax_bert.py @@ -15,12 +15,11 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -775,7 +774,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class( config=config, @@ -1059,7 +1058,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - # Model outputs = self.bert( input_ids, diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 834b6237d36e4a..5391d71a916c3b 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -759,7 +759,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index d398fc5154aec3..8d13bb4e546c22 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -194,7 +194,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, 
strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -242,7 +242,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py index b057f7e4ce797c..e55f3f36ad6dd3 100644 --- a/src/transformers/models/bert/tokenization_bert_fast.py +++ b/src/transformers/models/bert/tokenization_bert_fast.py @@ -216,7 +216,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/bert/tokenization_bert_tf.py b/src/transformers/models/bert/tokenization_bert_tf.py index e7ef0b411d2ec0..31171cdab112e5 100644 --- a/src/transformers/models/bert/tokenization_bert_tf.py +++ b/src/transformers/models/bert/tokenization_bert_tf.py @@ -2,7 +2,6 @@ from typing import List, Union import tensorflow as tf - from tensorflow_text import BertTokenizer as BertTokenizerLayer from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py index d602de22f044fe..f0cb795d93615f 100644 --- a/src/transformers/models/bert_generation/configuration_bert_generation.py +++ b/src/transformers/models/bert_generation/configuration_bert_generation.py @@ -100,7 +100,7 @@ def __init__( eos_token_id=1, position_embedding_type="absolute", use_cache=True, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 54044195e961ad..928cd4433e1ef5 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -394,7 +394,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -464,7 +463,6 @@ def load_tf_weights_in_bert_generation( try: import numpy as np import tensorflow.compat.v1 as tf - import tensorflow_hub as hub import tensorflow_text # noqa: F401 diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index 711dcdf50c2589..6ef3321277f365 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -92,7 +92,7 @@ def __init__( pad_token="", sep_token="<::::>", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 27d66ae9a990c7..5af9984bb9e9b0 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -158,7 +158,7 @@ def __init__( mecab_kwargs=None, sudachi_kwargs=None, jumanpp_kwargs=None, - **kwargs + **kwargs, ): super().__init__( spm_file=spm_file, diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index 50de2db480fa30..837fea136743d2 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -132,7 +132,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - **kwargs + **kwargs, ): super().__init__( normalization=normalization, diff --git a/src/transformers/models/big_bird/configuration_big_bird.py b/src/transformers/models/big_bird/configuration_big_bird.py index d9bcbfef081b53..53bf1ee6f44b75 100644 --- a/src/transformers/models/big_bird/configuration_big_bird.py +++ b/src/transformers/models/big_bird/configuration_big_bird.py @@ -131,7 +131,7 @@ def __init__( block_size=64, num_random_blocks=3, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py index 614443d81a5e79..34db9771b1e734 100644 --- a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py @@ -43,7 +43,6 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, big_bird_config_file, p if __name__ == "__main__": - parser = argparse.ArgumentParser() # Required parameters parser.add_argument( diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index ff5eeac628d4c2..cdb9e787b791e3 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -534,7 +534,6 @@ def bigbird_block_sparse_attention( plan_num_rand_blocks, output_attentions, ): - # BigBird block-sparse attention as suggested in paper # ITC: @@ -1606,7 +1605,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -1632,7 +1630,6 @@ def custom_forward(*inputs): blocked_encoder_mask, ) else: - layer_outputs = layer_module( hidden_states, attention_mask, @@ -2178,7 +2175,6 @@ def forward( @staticmethod def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int): - batch_size, seq_length = attention_mask.size() if seq_length % block_size != 0: raise ValueError( diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py index 2dfa871b118191..2c3806c754e9cd 100644 --- a/src/transformers/models/big_bird/modeling_flax_big_bird.py +++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py @@ -15,12 +15,11 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -481,7 +480,6 @@ def __call__( @staticmethod def create_masks_for_block_sparse_attn(attention_mask, block_size: int): - batch_size, seq_length = attention_mask.shape if seq_length % block_size != 0: raise ValueError( @@ -1578,7 +1576,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) if config.attention_type == "block_sparse" and input_shape is None: @@ -1862,7 +1860,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - # Model outputs = self.bert( input_ids, @@ -2181,7 +2178,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): if config.attention_type == "block_sparse" and input_shape is None: input_shape = (1, 1, 12 * config.block_size) @@ -2328,7 +2325,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - # Model outputs = self.bert( input_ids, diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index 47c00fa7c2faf9..bd6f90ef027acd 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -113,7 +113,7 @@ def __init__( mask_token="[MASK]", cls_token="[CLS]", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -206,7 +206,7 @@ def _decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, spaces_between_special_tokens: bool = True, - **kwargs + **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) diff --git a/src/transformers/models/big_bird/tokenization_big_bird_fast.py b/src/transformers/models/big_bird/tokenization_big_bird_fast.py index 6ff063e772e2da..11c3386794701d 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird_fast.py +++ b/src/transformers/models/big_bird/tokenization_big_bird_fast.py @@ -124,7 +124,7 @@ def __init__( sep_token="[SEP]", mask_token="[MASK]", cls_token="[CLS]", - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, 
rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py index ed9c0a42e07699..a7f198a735b385 100644 --- a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py @@ -158,7 +158,7 @@ def __init__( block_size=64, num_random_blocks=3, use_bias=False, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py b/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py index 2d2efdec77418e..5a81207548f9a2 100644 --- a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py +++ b/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py @@ -87,7 +87,6 @@ def rename_state_dict_key(k, patterns): def convert_bigbird_pegasus(tf_weights: dict, config_update: dict) -> BigBirdPegasusForConditionalGeneration: - cfg = BigBirdPegasusConfig(**config_update) torch_model = BigBirdPegasusForConditionalGeneration(cfg) state_dict = torch_model.state_dict() diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index ee784c9e8a6f4e..21a4c7ade40b7a 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -352,7 +352,6 @@ def bigbird_block_sparse_attention( plan_num_rand_blocks, output_attentions, ): - # BigBirdPegasus block-sparse attention as suggested in paper # ITC: @@ -1998,7 +1997,6 @@ def set_attention_type(self, value: str): @staticmethod # Copied from transformers.models.big_bird.modeling_big_bird.BigBirdModel.create_masks_for_block_sparse_attn def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int): - batch_size, seq_length = attention_mask.size() if seq_length % block_size != 0: raise ValueError( @@ -2266,7 +2264,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -2291,7 +2288,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -2397,7 +2393,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqModelOutput]: - # different to other models, BigBirdPegasus automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -2611,7 +2606,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past_key_values is used if past_key_values is not None: diff --git a/src/transformers/models/biogpt/configuration_biogpt.py b/src/transformers/models/biogpt/configuration_biogpt.py index 56ced88dc72d88..2fe46354d291e8 100644 --- a/src/transformers/models/biogpt/configuration_biogpt.py +++ b/src/transformers/models/biogpt/configuration_biogpt.py @@ -114,7 +114,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py index bcbda452a3258d..c930a850462c82 100755 --- a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py @@ -157,7 +157,6 @@ def rewrite_dict_keys(d): def convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path, pytorch_dump_folder_path): - # prep if not os.path.exists(biogpt_checkpoint_path): raise ValueError(f"path {biogpt_checkpoint_path} does not exist!") diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index 716db23c68ac42..3fd9c823f968d3 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -556,7 +556,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -578,7 +577,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -704,7 +702,6 @@ def forward( ) def prepare_inputs_for_generation(self, input_ids, attention_mask, past_key_values=None, **kwargs): - # only last token for inputs_ids if past is defined in kwargs if past_key_values: input_ids = input_ids[:, -1].unsqueeze(-1) diff --git a/src/transformers/models/biogpt/tokenization_biogpt.py b/src/transformers/models/biogpt/tokenization_biogpt.py index 405e4c8625133a..55f337f2ec9132 100644 --- a/src/transformers/models/biogpt/tokenization_biogpt.py +++ b/src/transformers/models/biogpt/tokenization_biogpt.py @@ -110,7 +110,7 @@ def __init__( eos_token="", sep_token="", pad_token="", - **kwargs + **kwargs, ): super().__init__( bos_token=bos_token, diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 7c1e105107e34c..278c7f1c7f1a5a 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -98,7 +98,7 @@ def __init__( output_stride=32, width_factor=1, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) if layer_type not in self.layer_types: diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py index 106c67d17e5e23..7cc7f64107ce9e 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -19,14 +19,14 @@ import json from pathlib import Path -import torch -from PIL import Image - import requests +import torch from huggingface_hub import hf_hub_download +from PIL import Image from timm import create_model from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform + from transformers import BitConfig, BitForImageClassification, BitImageProcessor from transformers.image_utils import PILImageResampling from transformers.utils import logging diff --git a/src/transformers/models/bit/image_processing_bit.py b/src/transformers/models/bit/image_processing_bit.py index 4395d0558455e9..374f57af4cd481 100644 --- a/src/transformers/models/bit/image_processing_bit.py +++ b/src/transformers/models/bit/image_processing_bit.py @@ -102,7 +102,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -128,7 +128,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -155,7 +155,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the @@ -179,7 +179,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. 
@@ -200,7 +200,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -233,7 +233,7 @@ def preprocess( do_convert_rgb: bool = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index ef18a3cec2e537..93ee9281364526 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -135,7 +135,7 @@ def __init__( eos_token_id=2, encoder_no_repeat_ngram_size=3, forced_eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 1f2b3a878d836c..22c647d67c8a56 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -1015,7 +1015,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1040,7 +1039,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1384,7 +1382,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py index baadfd973e69fd..629ddb99a80c5c 100644 --- a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py @@ -19,11 +19,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -889,7 +888,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1448,7 +1447,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index 23693b24bd6941..6b95bd56739adc 100644 --- 
a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -785,7 +785,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1101,7 +1100,7 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1219,7 +1218,7 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]: outputs = self.model( input_ids=input_ids, @@ -1456,9 +1455,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index ace4afc6d503c6..208ced46bc2db8 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -185,7 +185,7 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index 4e65294cb9041d..7c4e060e5d2035 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -148,7 +148,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index c056fa46bd1f07..fbc23435d66f31 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -134,7 +134,7 @@ def __init__( bos_token_id=1, eos_token_id=2, forced_eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 633675dc56720d..0f5e5a46c95397 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -1011,7 +1011,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -1036,7 +1035,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1350,7 +1348,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py index 78947481faa5f3..226e401c921ea5 100644 --- a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py @@ -20,11 +20,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -887,7 +886,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1446,7 +1445,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index 0ef4e1e8beef5c..3d521ea77a4d67 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -792,7 +792,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1109,9 +1108,8 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): - output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1212,9 +1210,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]: - outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -1435,9 +1432,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index 6ef94e636aebb0..a0b45bff1dc78c 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -104,7 +104,7 @@ def __init__( eos_token="__end__", 
unk_token="__unk__", pad_token="__null__", - **kwargs + **kwargs, ): super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs) diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py index 8dfae5894fa6c3..adc350f3d11132 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -72,7 +72,7 @@ def __init__( eos_token="<|endoftext|>", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( ByteLevelBPETokenizer( diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index 04d88570e82a72..8bdff88bff2fe8 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -133,7 +133,7 @@ def __init__( sep_token_id=102, is_decoder=True, use_cache=True, - **kwargs + **kwargs, ): super().__init__( pad_token_id=pad_token_id, @@ -161,7 +161,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from BlipConfig @@ -242,7 +241,7 @@ def __init__( layer_norm_eps=1e-5, attention_dropout=0.0, initializer_range=1e-10, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -261,7 +260,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from BlipConfig @@ -334,7 +332,7 @@ def __init__( projection_dim=512, logit_scale_init_value=2.6592, image_text_hidden_size=256, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py index 9deda9c11609fa..7609b4a40e857f 100644 --- a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py +++ b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py @@ -16,17 +16,17 @@ import argparse import re -import torch -from PIL import Image -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode - import requests +import torch # git clone https://github.com/salesforce/BLIP.git from models.blip import blip_decoder from models.blip_itm import blip_itm from models.blip_vqa import blip_vqa +from PIL import Image +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode + from transformers import ( BertTokenizer, BlipConfig, diff --git a/src/transformers/models/blip/image_processing_blip.py b/src/transformers/models/blip/image_processing_blip.py index e9b49924ec93ec..1b7e4c6b1a6a9f 100644 --- a/src/transformers/models/blip/image_processing_blip.py +++ b/src/transformers/models/blip/image_processing_blip.py @@ -91,9 +91,8 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - **kwargs + **kwargs, ) -> None: - super().__init__(**kwargs) size = size if size is not 
None else {"height": 384, "width": 384} size = get_size_dict(size, default_to_square=True) @@ -114,7 +113,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -142,7 +141,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -163,7 +162,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index 96e3f5f4e1ad59..7f1b3412b6845e 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -1036,7 +1036,7 @@ def generate( pixel_values: torch.FloatTensor, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.LongTensor] = None, - **generate_kwargs + **generate_kwargs, ) -> torch.LongTensor: r""" Overrides *generate* function to be able to use the model as a conditional generator @@ -1263,7 +1263,7 @@ def generate( input_ids: torch.LongTensor, pixel_values: torch.FloatTensor, attention_mask: Optional[torch.LongTensor] = None, - **generate_kwargs + **generate_kwargs, ) -> torch.LongTensor: r""" Overrides *generate* function to be able to use the model as a conditional generator diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py index 7085fc2f11e147..c44cf3b0dfebeb 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -409,7 +409,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warn( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -802,7 +801,6 @@ def forward( # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 class BlipTextLMHeadModel(BlipTextPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index e860f6723a2611..5a9967913e4486 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -63,7 +63,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -76,7 +76,6 @@ def __call__( # Get only text if images is None: - self.current_processor = self.tokenizer text_encoding = self.tokenizer( text=text, diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index 7eb4eb5322260b..74443d3e6eec3a 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -686,7 +686,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments + **deprecated_arguments, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: if deprecated_arguments.pop("position_ids", False) is not False: # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` @@ -752,12 +752,10 @@ def forward( ) for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -843,7 +841,7 @@ def prepare_inputs_for_generation( past_key_values: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, - **kwargs + **kwargs, ) -> dict: # only last token for input_ids if past is not None if past_key_values: @@ -886,7 +884,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments + **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1016,7 +1014,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments + **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1152,7 +1150,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments + **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py index 1d6f405039a80b..1c8efb10cb6c46 100644 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ b/src/transformers/models/bloom/tokenization_bloom_fast.py @@ -113,7 +113,7 @@ def __init__( eos_token="", pad_token="", add_prefix_space=False, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py b/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py index 933c88795ab94a..4753f593da19b2 100644 --- a/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py +++ b/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py @@ -18,17 +18,17 @@ import argparse import os -import numpy as np -import torch -from packaging import version -from torch import nn - import gluonnlp as nlp import mxnet as mx +import numpy as np +import torch from gluonnlp.base import get_home_dir from gluonnlp.model.bert import BERTEncoder from gluonnlp.model.utils import _load_vocab from gluonnlp.vocab import Vocab +from packaging import version +from torch import nn + from transformers import BertConfig, BertForMaskedLM, BertModel, RobertaTokenizer from transformers.models.bert.modeling_bert import ( BertIntermediate, diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index a88767a2172849..c04b01a2ea713c 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -88,7 +88,7 @@ def __init__( stop_gradient=False, share_layernorm=True, remove_last_layer=False, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.hidden_size = hidden_size @@ -207,7 +207,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -317,7 +317,7 @@ def __init__( init_layernorm_from_vision_encoder=False, text_config=None, 
vision_config=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 8e8be8373c7ae4..fdea55396ba446 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -170,7 +170,7 @@ def __init__( image_std: Optional[Union[float, List[float]]] = None, do_center_crop: bool = True, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -199,7 +199,7 @@ def resize( size_divisor: int = 32, resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -234,7 +234,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -254,7 +254,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to (size["height"], size["width"]). If the input size is smaller than `size` along any @@ -278,7 +278,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index f520a221f84a51..aa25ad52d7edf8 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -768,7 +768,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index a92932c041e0ba..c268d7c26f43d9 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -63,7 +63,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method uses [`BridgeTowerImageProcessor.__call__`] method to prepare image(s) for the model, and diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py index 0071d7a9afe4ed..59e694c343c559 100644 --- a/src/transformers/models/byt5/tokenization_byt5.py +++ b/src/transformers/models/byt5/tokenization_byt5.py @@ -67,7 +67,7 @@ def __init__( pad_token="<pad>", extra_ids=125, additional_special_tokens=None, - **kwargs + **kwargs, ) -> None: # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: diff --git a/src/transformers/models/camembert/configuration_camembert.py b/src/transformers/models/camembert/configuration_camembert.py index 09989f1cb85f4e..d712726492ae18 100644 --- a/src/transformers/models/camembert/configuration_camembert.py +++ b/src/transformers/models/camembert/configuration_camembert.py @@ -126,7 +126,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index e7755dd0305324..2a7b4ecbfac16f 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -515,7 +515,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/camembert/modeling_tf_camembert.py b/src/transformers/models/camembert/modeling_tf_camembert.py index 29f435bdb194b2..5142b3d82b04cb 100644 --- a/src/transformers/models/camembert/modeling_tf_camembert.py +++ b/src/transformers/models/camembert/modeling_tf_camembert.py @@ -729,7 +729,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index f5988fd9d784ec..658dd1080b7122 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -129,7 +129,7 @@ def __init__( mask_token="<mask>", additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"], sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e.
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py index cbffa75a28b744..8a5ebbedd1c7be 100644 --- a/src/transformers/models/camembert/tokenization_camembert_fast.py +++ b/src/transformers/models/camembert/tokenization_camembert_fast.py @@ -120,7 +120,7 @@ def __init__( pad_token="<pad>", mask_token="<mask>", additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"], - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/canine/configuration_canine.py b/src/transformers/models/canine/configuration_canine.py index b75ab9cc42b9a0..1fdeb3204a52e4 100644 --- a/src/transformers/models/canine/configuration_canine.py +++ b/src/transformers/models/canine/configuration_canine.py @@ -112,7 +112,7 @@ def __init__( num_hash_functions=8, num_hash_buckets=16384, local_transformer_stride=128, # Good TPU/XLA memory alignment. - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py index 15b7b6c32ae515..5d50050d039687 100644 --- a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py @@ -25,7 +25,6 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, pytorch_dump_path): - # Initialize PyTorch model config = CanineConfig() model = CanineModel(config) diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index 9cabe705b4486a..a91d42f0395ee8 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -312,7 +312,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward(self, char_encoding: torch.Tensor) -> torch.Tensor: - # `cls_encoding`: [batch, 1, hidden_size] cls_encoding = char_encoding[:, 0:1, :] diff --git a/src/transformers/models/canine/tokenization_canine.py b/src/transformers/models/canine/tokenization_canine.py index fba01e03c0e871..2fae9e1482bd32 100644 --- a/src/transformers/models/canine/tokenization_canine.py +++ b/src/transformers/models/canine/tokenization_canine.py @@ -86,7 +86,7 @@ def __init__( mask_token=chr(MASK), add_prefix_space=False, model_max_length=2048, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 9f581a087a0d38..f20e16e41cac64 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -122,7 +122,7 @@ def __init__( pad_token_id=0, position_embedding_type="absolute", use_cache=True, -
**kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -144,7 +144,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from ChineseCLIPConfig @@ -228,7 +227,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -248,7 +247,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from ChineseCLIPConfig @@ -402,7 +400,6 @@ def generate_dummy_inputs( seq_length: int = -1, framework: Optional["TensorType"] = None, ) -> Mapping[str, Any]: - text_input_dict = super().generate_dummy_inputs( processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework ) diff --git a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py b/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py index 6016c51b376ca8..02c4b7b754b295 100644 --- a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py @@ -21,7 +21,6 @@ def copy_attn_layer(hf_attn_layer, pt_weights, prefix): - q_proj, k_proj, v_proj = pt_weights[f"{prefix}.in_proj_weight"].chunk(3, dim=0) q_proj_bias, k_proj_bias, v_proj_bias = pt_weights[f"{prefix}.in_proj_bias"].chunk(3, dim=0) diff --git a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py index 848bc086a1f086..18a80e21af6cfc 100644 --- a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py @@ -102,7 +102,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -128,7 +128,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -155,7 +155,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the @@ -177,7 +177,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -198,7 +198,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
@@ -231,7 +231,7 @@ def preprocess( do_convert_rgb: bool = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 527a8a09113cc5..ce6a283a05b3ed 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -900,7 +900,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 84dec5d71a58c7..1295245993110b 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -108,7 +108,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -127,7 +127,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from CLIPConfig @@ -211,7 +210,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -231,7 +230,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from CLIPConfig @@ -383,7 +381,6 @@ def generate_dummy_inputs( seq_length: int = -1, framework: Optional["TensorType"] = None, ) -> Mapping[str, Any]: - text_input_dict = super().generate_dummy_inputs( processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework ) diff --git a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py index 58886aa88a3440..0033be274d5c13 100644 --- a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py @@ -16,8 +16,8 @@ import argparse import torch - from clip import load + from transformers import CLIPConfig, CLIPModel diff --git a/src/transformers/models/clip/image_processing_clip.py b/src/transformers/models/clip/image_processing_clip.py index ac99feb54dd9ae..325f1c3991817d 100644 --- a/src/transformers/models/clip/image_processing_clip.py +++ b/src/transformers/models/clip/image_processing_clip.py @@ -102,7 +102,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -128,7 +128,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = 
PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -155,7 +155,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the @@ -179,7 +179,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -200,7 +200,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -233,7 +233,7 @@ def preprocess( do_convert_rgb: bool = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py index ea4ff88a2c6b23..cb8ee4e7c9a444 100644 --- a/src/transformers/models/clip/modeling_flax_clip.py +++ b/src/transformers/models/clip/modeling_flax_clip.py @@ -593,7 +593,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -673,7 +673,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): if input_shape is None: input_shape = (1, config.image_size, config.image_size, 3) @@ -744,7 +744,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): if input_shape is None: input_shape = ((1, 1), (1, config.vision_config.image_size, config.vision_config.image_size, 3)) diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index 9ea7e7690428a8..680ea91ca1383a 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -151,7 +151,6 @@ def __init__(self, config: CLIPVisionConfig, **kwargs): ) def build(self, input_shape: tf.TensorShape): - factor = self.config.initializer_factor self.class_embedding = self.add_weight( @@ -205,7 +204,6 @@ def __init__(self, config: CLIPTextConfig, **kwargs): self.config = config def build(self, input_shape: tf.TensorShape): - with tf.name_scope("token_embedding"): self.weight = self.add_weight( shape=(self.config.vocab_size, self.embed_dim), @@ -381,7 +379,6 @@ def __init__(self, config: CLIPConfig, **kwargs): ) def call(self, hidden_states: tf.Tensor) -> tf.Tensor: - hidden_states = self.fc1(inputs=hidden_states) hidden_states = self.activation_fn(hidden_states) hidden_states = self.fc2(inputs=hidden_states) @@ -644,7 +641,6 @@ def call( return_dict: bool, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - embedding_output = self.embeddings(pixel_values=pixel_values) embedding_output 
= self.pre_layernorm(inputs=embedding_output) @@ -694,7 +690,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -753,7 +748,6 @@ def __init__(self, config: CLIPConfig, **kwargs): ) def build(self, input_shape: tf.TensorShape): - self.logit_scale = self.add_weight( shape=(1,), initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), @@ -774,7 +768,6 @@ def get_text_features( return_dict: Optional[bool] = None, training: bool = False, ) -> tf.Tensor: - if input_ids is None: raise ValueError("You have to specify either input_ids") @@ -836,7 +829,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]: - if input_ids is None: raise ValueError("You have to specify either input_ids") if pixel_values is None: diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py index ef8da45cda2afd..e3ff5f8626fa6f 100644 --- a/src/transformers/models/clip/tokenization_clip.py +++ b/src/transformers/models/clip/tokenization_clip.py @@ -295,7 +295,7 @@ def __init__( bos_token="<|startoftext|>", eos_token="<|endoftext|>", pad_token="<|endoftext|>", # hack to enable padding - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/clip/tokenization_clip_fast.py b/src/transformers/models/clip/tokenization_clip_fast.py index df11bf793f0871..75b3e4f4078053 100644 --- a/src/transformers/models/clip/tokenization_clip_fast.py +++ b/src/transformers/models/clip/tokenization_clip_fast.py @@ -84,7 +84,7 @@ def __init__( bos_token="<|startoftext|>", eos_token="<|endoftext|>", pad_token="<|endoftext|>", # hack to enable padding - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 33a9330e1f4f48..1910c946325ae4 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -99,7 +99,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -117,7 +117,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from CLIPSegConfig @@ -200,7 +199,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -219,7 +218,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from CLIPSegConfig @@ -316,7 +314,7 @@ def __init__( decoder_intermediate_size=2048, conditional_layer=0, use_complex_transposed_convolution=False, - **kwargs + **kwargs, ): 
super().__init__(**kwargs) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 778dbca2996782..183bb93b9e2b75 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -17,10 +17,10 @@ import argparse +import requests import torch from PIL import Image -import requests from transformers import ( CLIPSegConfig, CLIPSegForImageSegmentation, diff --git a/src/transformers/models/codegen/configuration_codegen.py b/src/transformers/models/codegen/configuration_codegen.py index aed1249f86b0b1..1a1e609f0111fb 100644 --- a/src/transformers/models/codegen/configuration_codegen.py +++ b/src/transformers/models/codegen/configuration_codegen.py @@ -124,7 +124,7 @@ def __init__( bos_token_id=50256, eos_token_id=50256, tie_word_embeddings=False, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.n_ctx = n_ctx diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 92a7d9506cd9a2..87a2a986c82055 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -147,7 +147,6 @@ def _attn( attention_mask=None, head_mask=None, ): - # compute causal mask from causal mask buffer query_length, key_length = query.size(-2), key.size(-2) causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length] @@ -193,7 +192,6 @@ def forward( Tuple[torch.Tensor, Tuple[torch.Tensor]], Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], ]: - qkv = self.qkv_proj(hidden_states) # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic mp_num = 4 @@ -545,12 +543,10 @@ def forward( all_self_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. 
Setting " diff --git a/src/transformers/models/codegen/tokenization_codegen.py b/src/transformers/models/codegen/tokenization_codegen.py index ff86eee8231ab6..c09a816bfbab5c 100644 --- a/src/transformers/models/codegen/tokenization_codegen.py +++ b/src/transformers/models/codegen/tokenization_codegen.py @@ -21,7 +21,6 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union import numpy as np - import regex as re from ...utils import is_tf_available, is_torch_available, logging @@ -160,7 +159,7 @@ def __init__( pad_token=None, add_prefix_space=False, add_bos_token=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -321,7 +320,7 @@ def decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, truncate_before_pattern: Optional[List[str]] = None, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special diff --git a/src/transformers/models/codegen/tokenization_codegen_fast.py b/src/transformers/models/codegen/tokenization_codegen_fast.py index 1c3dcf85fb5ea1..332f0ed934acad 100644 --- a/src/transformers/models/codegen/tokenization_codegen_fast.py +++ b/src/transformers/models/codegen/tokenization_codegen_fast.py @@ -126,7 +126,7 @@ def __init__( bos_token="<|endoftext|>", eos_token="<|endoftext|>", add_prefix_space=False, - **kwargs + **kwargs, ): super().__init__( vocab_file, @@ -187,7 +187,7 @@ def decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, truncate_before_pattern: Optional[List[str]] = None, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 08fd5f08357153..ec04f0a52369fa 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -178,7 +178,7 @@ def __init__( bbox_loss_coefficient=5, giou_loss_coefficient=2, focal_alpha=0.25, - **kwargs + **kwargs, ): if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") @@ -240,7 +240,6 @@ def hidden_size(self) -> int: class ConditionalDetrOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index a4e28cbb558a30..083b3c681ec22d 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -20,11 +20,11 @@ from collections import OrderedDict from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( ConditionalDetrConfig, ConditionalDetrFeatureExtractor, diff 
--git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 4f161bbb6ec58b..d4e2f9dd5f3ac2 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -788,7 +788,7 @@ def __init__( image_mean: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -901,7 +901,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[ChannelDimension] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an @@ -1089,7 +1089,7 @@ def preprocess( format: Optional[Union[str, AnnotionFormat]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Preprocess an image or a batch of images so that it can be used by the model. diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py index bf3fb9994727d0..cbee21aafe3a52 100644 --- a/src/transformers/models/convbert/tokenization_convbert.py +++ b/src/transformers/models/convbert/tokenization_convbert.py @@ -133,7 +133,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -181,7 +181,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/convbert/tokenization_convbert_fast.py b/src/transformers/models/convbert/tokenization_convbert_fast.py index 65c37a9b0927e8..07447bb6a17caa 100644 --- a/src/transformers/models/convbert/tokenization_convbert_fast.py +++ b/src/transformers/models/convbert/tokenization_convbert_fast.py @@ -110,7 +110,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/convnext/configuration_convnext.py b/src/transformers/models/convnext/configuration_convnext.py index 41562cbcd44efa..d4807bc5741a02 100644 --- a/src/transformers/models/convnext/configuration_convnext.py +++ b/src/transformers/models/convnext/configuration_convnext.py @@ -97,7 +97,7 @@ def __init__( drop_path_rate=0.0, image_size=224, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -125,7 +125,6 @@ def __init__( class ConvNextOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/convnext/convert_convnext_to_pytorch.py b/src/transformers/models/convnext/convert_convnext_to_pytorch.py index e40565c7a691cd..69c300eee40406 100644 --- a/src/transformers/models/convnext/convert_convnext_to_pytorch.py +++ b/src/transformers/models/convnext/convert_convnext_to_pytorch.py @@ -21,11 +21,11 @@ import json from pathlib import Path +import requests 
import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ConvNextConfig, ConvNextFeatureExtractor, ConvNextForImageClassification from transformers.utils import logging diff --git a/src/transformers/models/convnext/image_processing_convnext.py b/src/transformers/models/convnext/image_processing_convnext.py index 2353767df53ab4..a46bdcfef75b77 100644 --- a/src/transformers/models/convnext/image_processing_convnext.py +++ b/src/transformers/models/convnext/image_processing_convnext.py @@ -99,7 +99,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 384} @@ -123,7 +123,7 @@ def resize( crop_pct: float, resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -166,7 +166,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -187,7 +187,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py index bf2ec8f7c4511f..f509519271d4e8 100644 --- a/src/transformers/models/cpm/tokenization_cpm.py +++ b/src/transformers/models/cpm/tokenization_cpm.py @@ -53,7 +53,7 @@ def __init__( mask_token="<mask>", additional_special_tokens=["<eop>", "<eod>"], sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: """ Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and diff --git a/src/transformers/models/cpm/tokenization_cpm_fast.py b/src/transformers/models/cpm/tokenization_cpm_fast.py index 032aebcf5b1ede..31a2aaa9f1d848 100644 --- a/src/transformers/models/cpm/tokenization_cpm_fast.py +++ b/src/transformers/models/cpm/tokenization_cpm_fast.py @@ -53,7 +53,7 @@ def __init__( cls_token="<cls>", mask_token="<mask>", additional_special_tokens=["<eop>", "<eod>"], - **kwargs + **kwargs, ): """ Construct a CPM tokenizer.
Based on [Jieba](https://pypi.org/project/jieba/) and diff --git a/src/transformers/models/ctrl/configuration_ctrl.py b/src/transformers/models/ctrl/configuration_ctrl.py index d207c3fa4b2fa2..0a1feed58b24db 100644 --- a/src/transformers/models/ctrl/configuration_ctrl.py +++ b/src/transformers/models/ctrl/configuration_ctrl.py @@ -97,7 +97,7 @@ def __init__( layer_norm_epsilon=1e-6, initializer_range=0.02, use_cache=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.n_positions = n_positions diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index 116bb4ca665bdd..dcd3f5a03e0c92 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -269,7 +269,6 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = False, ) -> Union[Tuple, TFBaseModelOutputWithPast]: - # If using past key value states, only the last tokens # should be given as an input if past_key_values is not None: diff --git a/src/transformers/models/cvt/configuration_cvt.py b/src/transformers/models/cvt/configuration_cvt.py index 0ab32857d49608..a540c0f4807cca 100644 --- a/src/transformers/models/cvt/configuration_cvt.py +++ b/src/transformers/models/cvt/configuration_cvt.py @@ -121,7 +121,7 @@ def __init__( stride_q=[1, 1, 1], initializer_range=0.02, layer_norm_eps=1e-12, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.num_channels = num_channels diff --git a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py index 1c9f58f4a68d5d..e84c61d6aad644 100644 --- a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py @@ -22,8 +22,8 @@ from collections import OrderedDict import torch - from huggingface_hub import cached_download, hf_hub_url + from transformers import AutoFeatureExtractor, CvtConfig, CvtForImageClassification diff --git a/src/transformers/models/cvt/modeling_cvt.py b/src/transformers/models/cvt/modeling_cvt.py index 220e40396c9b8b..99e3a02febf4d2 100644 --- a/src/transformers/models/cvt/modeling_cvt.py +++ b/src/transformers/models/cvt/modeling_cvt.py @@ -212,7 +212,7 @@ def __init__( qkv_bias, attention_drop_rate, with_cls_token=True, - **kwargs + **kwargs, ): super().__init__() self.scale = embed_dim**-0.5 @@ -616,7 +616,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithCLSToken]: - output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/cvt/modeling_tf_cvt.py b/src/transformers/models/cvt/modeling_tf_cvt.py index c82cade5247982..52cc6585a7a552 100644 --- a/src/transformers/models/cvt/modeling_tf_cvt.py +++ b/src/transformers/models/cvt/modeling_tf_cvt.py @@ -109,7 +109,7 @@ def __init__( stride: int, padding: int, dropout_rate: float, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.convolution_embeddings = TFCvtConvEmbeddings( @@ -211,7 +211,7 @@ def __init__( stride: int, padding: int, projection_method: str = "dw_bn", - **kwargs + **kwargs, ): super().__init__(**kwargs) if projection_method == "dw_bn": @@ -246,7 +246,7 @@ def __init__( qkv_bias: bool, attention_drop_rate: float, with_cls_token: bool = True, - **kwargs + 
**kwargs, ): super().__init__(**kwargs) self.scale = embed_dim**-0.5 @@ -470,7 +470,7 @@ def __init__( mlp_ratio: float, drop_path_rate: float, with_cls_token: bool = True, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.attention = TFCvtAttention( diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index c1aee468321482..2ec526924f36eb 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -211,7 +211,7 @@ def __init__( adapter_stride=2, num_adapter_layers=3, output_hidden_size=None, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py index a990e933bc6762..305a3ea5e4ffa4 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -117,7 +117,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/data2vec/configuration_data2vec_vision.py b/src/transformers/models/data2vec/configuration_data2vec_vision.py index a63c9429c3382f..b45f8420ca0008 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_vision.py +++ b/src/transformers/models/data2vec/configuration_data2vec_vision.py @@ -142,7 +142,7 @@ def __init__( auxiliary_num_convs=1, auxiliary_concat_input=False, semantic_loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -180,7 +180,6 @@ def __init__( # Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig class Data2VecVisionOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py index 7777e85927cdaa..8ff8c1b910f0b4 100755 --- a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py @@ -3,10 +3,10 @@ import json import torch -from PIL import Image - from huggingface_hub import hf_hub_download +from PIL import Image from timm.models import create_model + from transformers import ( BeitFeatureExtractor, Data2VecVisionConfig, diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 8ad1c9f0e9fb77..6cda1b869bca68 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -1049,7 +1049,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index b3c5cba4f226c1..737209142cea40 100644 --- 
a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -501,7 +501,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index c1b85c7f093265..42a1edcb6493ca 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -150,7 +150,6 @@ def __init__(self, config: Data2VecVisionConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor: - embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.size() diff --git a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py index cfb81ceb396e05..c837670b1a1f69 100644 --- a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py @@ -165,7 +165,6 @@ def build(self, input_shape: tf.TensorShape): super().build(input_shape) def call(self, pixel_values: tf.Tensor, bool_masked_pos: Optional[tf.Tensor] = None) -> tf.Tensor: - embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, projection_dim = shape_list(embeddings) @@ -909,7 +908,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[tuple, TFData2VecVisionModelOutputWithPooling]: - outputs = self.data2vec_vision( pixel_values=pixel_values, bool_masked_pos=bool_masked_pos, @@ -1027,7 +1025,7 @@ def __init__( padding: str = "valid", bias: bool = False, dilation: Union[int, Tuple[int, int]] = 1, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) self.conv = tf.keras.layers.Conv2D( @@ -1261,7 +1259,7 @@ def __init__( in_index: int = 2, kernel_size: int = 3, dilation: Union[int, Tuple[int, int]] = 1, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) self.in_channels = config.hidden_size diff --git a/src/transformers/models/deberta/configuration_deberta.py b/src/transformers/models/deberta/configuration_deberta.py index ec00f0eccb2d65..94ea91cd3a0888 100644 --- a/src/transformers/models/deberta/configuration_deberta.py +++ b/src/transformers/models/deberta/configuration_deberta.py @@ -128,7 +128,7 @@ def __init__( pos_att_type=None, pooler_dropout=0, pooler_hidden_act="gelu", - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index ecfd0c53e6a6d5..2b63b428bbe9b2 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -454,7 +454,6 @@ def forward( next_kv = hidden_states rel_embeddings = self.get_rel_embedding() for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index c86b900042c059..016ce15db61825 100644 --- 
a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -92,7 +92,6 @@ def __init__(self, axis=-1, **kwargs): self.axis = axis def call(self, inputs: tf.Tensor, mask: tf.Tensor): - rmask = tf.logical_not(tf.cast(mask, tf.bool)) output = tf.where(rmask, float("-inf"), inputs) output = stable_softmax(output, self.axis) @@ -352,7 +351,6 @@ def call( rel_embeddings = self.get_rel_embedding() for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -659,7 +657,6 @@ def linear(w, b, x): return outputs def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): - if relative_pos is None: q = shape_list(query_layer)[-2] relative_pos = build_relative_position(q, shape_list(key_layer)[-2]) @@ -944,7 +941,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py index bbddb00a2682c9..bcaaaa4421178f 100644 --- a/src/transformers/models/deberta/tokenization_deberta.py +++ b/src/transformers/models/deberta/tokenization_deberta.py @@ -191,7 +191,7 @@ def __init__( mask_token="[MASK]", add_prefix_space=False, add_bos_token=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/deberta/tokenization_deberta_fast.py b/src/transformers/models/deberta/tokenization_deberta_fast.py index f708de1636216f..959bcae470112a 100644 --- a/src/transformers/models/deberta/tokenization_deberta_fast.py +++ b/src/transformers/models/deberta/tokenization_deberta_fast.py @@ -154,9 +154,8 @@ def __init__( pad_token="[PAD]", mask_token="[MASK]", add_prefix_space=False, - **kwargs + **kwargs, ): - super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/deberta_v2/configuration_deberta_v2.py b/src/transformers/models/deberta_v2/configuration_deberta_v2.py index 3e7d0d97fe6f05..d55486cd563381 100644 --- a/src/transformers/models/deberta_v2/configuration_deberta_v2.py +++ b/src/transformers/models/deberta_v2/configuration_deberta_v2.py @@ -130,7 +130,7 @@ def __init__( pos_att_type=None, pooler_dropout=0, pooler_hidden_act="gelu", - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 7d73a49ef9281f..ef04a24b2f63e1 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -498,7 +498,6 @@ def forward( rel_embeddings = self.get_rel_embedding() output_states = next_kv for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (output_states,) diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 885212be7389be..015eb392574087 100644 --- 
a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -93,7 +93,6 @@ def __init__(self, axis=-1, **kwargs): self.axis = axis def call(self, inputs: tf.Tensor, mask: tf.Tensor): - rmask = tf.logical_not(tf.cast(mask, tf.bool)) output = tf.where(rmask, float("-inf"), inputs) output = stable_softmax(output, self.axis) @@ -416,7 +415,6 @@ def call( rel_embeddings = self.get_rel_embedding() output_states = next_kv for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (output_states,) @@ -713,7 +711,6 @@ def call( return outputs def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): - if relative_pos is None: q = shape_list(query_layer)[-2] relative_pos = build_relative_position( @@ -1036,7 +1033,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index fc259dd7d5eec9..b2a0d844f1625d 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -120,7 +120,7 @@ def __init__( cls_token="[CLS]", mask_token="[MASK]", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py index 32ccd84862fa86..3f2a90cfa83ea9 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py @@ -128,7 +128,7 @@ def __init__( pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", - **kwargs + **kwargs, ) -> None: super().__init__( vocab_file, diff --git a/src/transformers/models/decision_transformer/configuration_decision_transformer.py b/src/transformers/models/decision_transformer/configuration_decision_transformer.py index 17c81c767c5c24..91ef58665a1f18 100644 --- a/src/transformers/models/decision_transformer/configuration_decision_transformer.py +++ b/src/transformers/models/decision_transformer/configuration_decision_transformer.py @@ -134,7 +134,6 @@ def __init__( reorder_and_upcast_attn=False, **kwargs, ): - self.state_dim = state_dim self.act_dim = act_dim self.hidden_size = hidden_size diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index 5008c7ed7e9c5b..1f5f7601229d32 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -288,8 +288,8 @@ def forward( if encoder_hidden_states is not None: if not hasattr(self, "q_attn"): raise ValueError( - "If class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to" - " instantiate class with `DecisionTransformerGPT2Attention(..., is_cross_attention=True)`." 
+ "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `DecisionTransformerGPT2Attention(..., is_cross_attention=True)`." ) query = self.q_attn(hidden_states) @@ -612,7 +612,6 @@ def forward( all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None all_hidden_states = () if output_hidden_states else None for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - # Model parallel if self.model_parallel: torch.cuda.set_device(hidden_states.device) @@ -628,7 +627,6 @@ def forward( all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 253d521aa5de7a..d00b71fab811d6 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -193,7 +193,7 @@ def __init__( giou_loss_coefficient=2, eos_coefficient=0.1, focal_alpha=0.25, - **kwargs + **kwargs, ): if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") diff --git a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py index 8e4461d515c2ea..d1fd8bcbe46677 100644 --- a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py +++ b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import cached_download, hf_hub_url from PIL import Image -import requests -from huggingface_hub import cached_download, hf_hub_url from transformers import DeformableDetrConfig, DeformableDetrFeatureExtractor, DeformableDetrForObjectDetection from transformers.utils import logging diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 5b23c05c085c9d..3601a2aad11f68 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -786,7 +786,7 @@ def __init__( image_mean: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -899,7 +899,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[ChannelDimension] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an @@ -1087,7 +1087,7 @@ def preprocess( format: Optional[Union[str, AnnotionFormat]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Preprocess an image or a batch of images so that it can be used by the model. 
diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index c675e793211ce5..a7ee782501a2ab 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1172,7 +1172,6 @@ def get_reference_points(spatial_shapes, valid_ratios, device): """ reference_points_list = [] for level, (height, width) in enumerate(spatial_shapes): - ref_y, ref_x = meshgrid( torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), diff --git a/src/transformers/models/deit/configuration_deit.py b/src/transformers/models/deit/configuration_deit.py index 8fbba3e9be08d7..b395afdbef5cf3 100644 --- a/src/transformers/models/deit/configuration_deit.py +++ b/src/transformers/models/deit/configuration_deit.py @@ -109,7 +109,7 @@ def __init__( num_channels=3, qkv_bias=True, encoder_stride=16, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py b/src/transformers/models/deit/convert_deit_timm_to_pytorch.py index 8a8a394c3f8103..a91702106e0f8c 100644 --- a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py +++ b/src/transformers/models/deit/convert_deit_timm_to_pytorch.py @@ -19,12 +19,12 @@ import json from pathlib import Path -import torch -from PIL import Image - import requests import timm +import torch from huggingface_hub import hf_hub_download +from PIL import Image + from transformers import DeiTConfig, DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher from transformers.utils import logging diff --git a/src/transformers/models/deit/image_processing_deit.py b/src/transformers/models/deit/image_processing_deit.py index 2c0ad59fa57f5b..77d7d2bb2ca2e5 100644 --- a/src/transformers/models/deit/image_processing_deit.py +++ b/src/transformers/models/deit/image_processing_deit.py @@ -91,7 +91,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 256, "width": 256} @@ -116,7 +116,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PIL.Image.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])` using the specified resampling filter. @@ -143,7 +143,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to `(crop_size["height"], crop_size["width"])`. If the input size is smaller than @@ -167,7 +167,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -188,7 +188,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index ca849f230e6c5e..f05b16efe7a045 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -204,7 +204,6 @@ def __init__(self, config: DeiTConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -262,7 +261,6 @@ def __init__(self, config: DeiTConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index 14e2db1580b61d..06b4d3f892377e 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -174,7 +174,7 @@ def __init__( giou_loss_coefficient=2, eos_coefficient=0.1, focal_alpha=0.25, - **kwargs + **kwargs, ): if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") diff --git a/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py b/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py index 84672301034982..cc17568bd64133 100644 --- a/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py +++ b/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py @@ -21,11 +21,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import cached_download, hf_hub_download, hf_hub_url from PIL import Image -import requests -from huggingface_hub import cached_download, hf_hub_download, hf_hub_url from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor from transformers.utils import logging diff --git a/src/transformers/models/deta/convert_deta_swin_to_pytorch.py b/src/transformers/models/deta/convert_deta_swin_to_pytorch.py index 46ed720e24dafd..911bc434e14265 100644 --- a/src/transformers/models/deta/convert_deta_swin_to_pytorch.py +++ b/src/transformers/models/deta/convert_deta_swin_to_pytorch.py @@ -21,11 +21,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import cached_download, hf_hub_download, hf_hub_url from PIL import Image -import requests -from huggingface_hub import cached_download, hf_hub_download, hf_hub_url from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor, SwinConfig from transformers.utils import logging diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index 1ef7700975229b..717fbfdd540a97 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -491,7 +491,7 @@ def __init__( image_mean: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -568,7 +568,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[ChannelDimension] = None, - **kwargs + **kwargs, ) -> 
np.ndarray: """ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an @@ -746,7 +746,7 @@ def preprocess( format: Optional[Union[str, AnnotionFormat]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Preprocess an image or a batch of images so that it can be used by the model. diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index 750bf114f5b6b0..eb77604fbfc4ee 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -1069,7 +1069,6 @@ def get_reference_points(spatial_shapes, valid_ratios, device): """ reference_points_list = [] for level, (height, width) in enumerate(spatial_shapes): - ref_y, ref_x = meshgrid( torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index e730c833ff12fe..430efc913b37c3 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -174,7 +174,7 @@ def __init__( bbox_loss_coefficient=5, giou_loss_coefficient=2, eos_coefficient=0.1, - **kwargs + **kwargs, ): if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") @@ -235,7 +235,6 @@ def hidden_size(self) -> int: class DetrOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py index a03642c316448a..b6dcc617da7b62 100644 --- a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py @@ -20,11 +20,11 @@ from collections import OrderedDict from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import DetrConfig, DetrFeatureExtractor, DetrForObjectDetection, DetrForSegmentation from transformers.utils import logging diff --git a/src/transformers/models/detr/convert_detr_to_pytorch.py b/src/transformers/models/detr/convert_detr_to_pytorch.py index 0ef750f0e4fb2e..3ff2e38ac38325 100644 --- a/src/transformers/models/detr/convert_detr_to_pytorch.py +++ b/src/transformers/models/detr/convert_detr_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor, ResNetConfig from transformers.utils import logging @@ -60,7 +60,6 @@ def get_detr_config(model_name): def create_rename_keys(config): - # here we list all keys to be renamed (original name on the left, our name on the right) rename_keys = [] diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 
4b24ad42c7ef3d..f03465954e8de4 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -770,7 +770,7 @@ def __init__( image_mean: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -875,7 +875,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[ChannelDimension] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an @@ -1055,7 +1055,7 @@ def preprocess( format: Optional[Union[str, AnnotionFormat]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Preprocess an image or a batch of images so that it can be used by the model. diff --git a/src/transformers/models/dinat/configuration_dinat.py b/src/transformers/models/dinat/configuration_dinat.py index 8348d1beb9fe9f..e4ba8f940debe4 100644 --- a/src/transformers/models/dinat/configuration_dinat.py +++ b/src/transformers/models/dinat/configuration_dinat.py @@ -117,7 +117,7 @@ def __init__( layer_norm_eps=1e-5, layer_scale_init_value=0.0, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py index 7176fae899d0f2..ef19005834ac12 100644 --- a/src/transformers/models/dinat/modeling_dinat.py +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -337,7 +337,6 @@ def forward( hidden_states: torch.Tensor, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor]: - query_layer = self.transpose_for_scores(self.query(hidden_states)) key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) diff --git a/src/transformers/models/distilbert/configuration_distilbert.py b/src/transformers/models/distilbert/configuration_distilbert.py index b36917bc07580b..3dabb3d3e2340e 100644 --- a/src/transformers/models/distilbert/configuration_distilbert.py +++ b/src/transformers/models/distilbert/configuration_distilbert.py @@ -121,7 +121,7 @@ def __init__( qa_dropout=0.1, seq_classif_dropout=0.2, pad_token_id=0, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 214c5d916621ea..535c7372382dd7 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -329,7 +329,6 @@ def forward( output_hidden_states: bool = False, return_dict: Optional[bool] = None, ) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]: # docstyle-ignore - """ Parameters: x: torch.tensor(bs, seq_length, dim) Input sequence embedded. 
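Another recurring change in the modeling-file hunks above (DeiT, Dinat, DistilBERT, and the gradient-checkpointing branches) is the deletion of a blank line that sat directly after a `def ...:` signature or an `if ...:` block opener. The formatting pass apparently strips these so the body starts immediately under the opener. A toy before/after sketch, assuming `torch` is available as it is elsewhere in the repo (the class is invented for illustration):

```python
import torch
from torch import nn


class ToyOutput(nn.Module):
    """Toy stand-in for the forward() methods cleaned up in the hunks above."""

    def __init__(self, hidden: int = 8, p: float = 0.1) -> None:
        super().__init__()
        self.dense = nn.Linear(hidden, hidden)
        self.dropout = nn.Dropout(p)

    # Previously a blank line sat between the signature below and its first
    # statement; the cleanup removes it so the body follows the def directly.
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        return self.dropout(hidden_states)


if __name__ == "__main__":
    print(ToyOutput()(torch.zeros(2, 8)).shape)
```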
diff --git a/src/transformers/models/distilbert/modeling_flax_distilbert.py b/src/transformers/models/distilbert/modeling_flax_distilbert.py index 984931bb4003ea..24e2c7e3987e07 100644 --- a/src/transformers/models/distilbert/modeling_flax_distilbert.py +++ b/src/transformers/models/distilbert/modeling_flax_distilbert.py @@ -16,11 +16,10 @@ import math from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.traverse_util import flatten_dict, unflatten_dict from jax import lax @@ -200,7 +199,6 @@ def __call__( deterministic: bool = True, output_attentions: bool = False, ): - bs, q_len, dim = query.shape k_len = key.shape[1] # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' @@ -429,7 +427,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py index 612b96c83da38a..76582ae4eab1cd 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert.py +++ b/src/transformers/models/distilbert/tokenization_distilbert.py @@ -147,7 +147,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -199,7 +199,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/distilbert/tokenization_distilbert_fast.py b/src/transformers/models/distilbert/tokenization_distilbert_fast.py index 67763ad36e943b..dd9dcd165d4109 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert_fast.py +++ b/src/transformers/models/distilbert/tokenization_distilbert_fast.py @@ -140,7 +140,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py b/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py index 7503f035eacc28..d80e9d890ddc30 100644 --- a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py +++ b/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import BeitConfig, BeitFeatureExtractor, BeitForImageClassification, BeitForMaskedImageModeling from transformers.image_utils import PILImageResampling from transformers.utils import logging diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py index d3316bdc79f685..02bd0f72ecb93c 100644 --- a/src/transformers/models/donut/configuration_donut_swin.py +++ b/src/transformers/models/donut/configuration_donut_swin.py @@ -113,7 +113,7 @@ def __init__( 
patch_norm=True, initializer_range=0.02, layer_norm_eps=1e-5, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 507f10cb776cf0..cd0f43773567f1 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -18,8 +18,8 @@ import torch from datasets import load_dataset - from donut import DonutModel + from transformers import ( DonutFeatureExtractor, DonutProcessor, diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index 835fdb9f64a166..325a2bb9b60215 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -102,7 +102,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) @@ -207,7 +207,7 @@ def thumbnail( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any @@ -248,7 +248,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -275,7 +275,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -296,7 +296,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -330,7 +330,7 @@ def preprocess( image_std: Optional[Union[float, List[float]]] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. 
diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index 9c5ab57dc4e86b..bb9e863a36c02a 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -675,7 +675,6 @@ def forward( ) -> Tuple[torch.Tensor]: height, width = input_dimensions for i, layer_module in enumerate(self.blocks): - layer_head_mask = head_mask[i] if head_mask is not None else None layer_outputs = layer_module( diff --git a/src/transformers/models/dpr/configuration_dpr.py b/src/transformers/models/dpr/configuration_dpr.py index cfbf296994b739..5551883e09645e 100644 --- a/src/transformers/models/dpr/configuration_dpr.py +++ b/src/transformers/models/dpr/configuration_dpr.py @@ -126,7 +126,7 @@ def __init__( pad_token_id=0, position_embedding_type="absolute", projection_dim: int = 0, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index 471376322eda25..a551e507300b0d 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -170,7 +170,6 @@ def _set_gradient_checkpointing(self, module, value=False): class DPREncoder(DPRPreTrainedModel): - base_model_prefix = "bert_model" def __init__(self, config: DPRConfig): @@ -227,7 +226,6 @@ def embeddings_size(self) -> int: class DPRSpanPredictor(DPRPreTrainedModel): - base_model_prefix = "encoder" def __init__(self, config: DPRConfig): diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py index 28aba892e9b8f6..565ad37b2117e8 100644 --- a/src/transformers/models/dpr/modeling_tf_dpr.py +++ b/src/transformers/models/dpr/modeling_tf_dpr.py @@ -146,7 +146,6 @@ class TFDPRReaderOutput(ModelOutput): class TFDPREncoderLayer(tf.keras.layers.Layer): - base_model_prefix = "bert_model" def __init__(self, config: DPRConfig, **kwargs): @@ -210,7 +209,6 @@ def embeddings_size(self) -> int: class TFDPRSpanPredictorLayer(tf.keras.layers.Layer): - base_model_prefix = "encoder" def __init__(self, config: DPRConfig, **kwargs): @@ -275,7 +273,6 @@ def call( class TFDPRSpanPredictor(TFPreTrainedModel): - base_model_prefix = "encoder" def __init__(self, config: DPRConfig, **kwargs): diff --git a/src/transformers/models/dpr/tokenization_dpr.py b/src/transformers/models/dpr/tokenization_dpr.py index 7cd01a18fc06db..a14133459b7e83 100644 --- a/src/transformers/models/dpr/tokenization_dpr.py +++ b/src/transformers/models/dpr/tokenization_dpr.py @@ -230,7 +230,7 @@ def __call__( max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_attention_mask: Optional[bool] = None, - **kwargs + **kwargs, ) -> BatchEncoding: if titles is None and texts is None: return super().__call__( diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py index 280f856a174ba0..507cd2bc40bcea 100644 --- a/src/transformers/models/dpr/tokenization_dpr_fast.py +++ b/src/transformers/models/dpr/tokenization_dpr_fast.py @@ -231,7 +231,7 @@ def __call__( max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_attention_mask: Optional[bool] = None, - **kwargs + **kwargs, ) -> BatchEncoding: if titles is None and texts is None: return super().__call__( diff --git a/src/transformers/models/dpt/configuration_dpt.py 
b/src/transformers/models/dpt/configuration_dpt.py index 034f1b7e166ec2..7f2dd2e807b702 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -152,7 +152,7 @@ def __init__( backbone_featmap_shape=[1, 1024, 24, 24], neck_ignore_stages=[0, 1], backbone_config=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py index bccc82bb2b3683..a563436b13c874 100644 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import cached_download, hf_hub_url from PIL import Image -import requests -from huggingface_hub import cached_download, hf_hub_url from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation from transformers.utils import logging diff --git a/src/transformers/models/dpt/convert_dpt_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_to_pytorch.py index dc26d017d73644..7ef5f7cf119a1b 100644 --- a/src/transformers/models/dpt/convert_dpt_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import cached_download, hf_hub_url from PIL import Image -import requests -from huggingface_hub import cached_download, hf_hub_url from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation from transformers.utils import logging diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 2bc57c9a2afe07..d6bcfe9c5e3d13 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -135,7 +135,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 384, "width": 384} @@ -159,7 +159,7 @@ def resize( ensure_multiple_of: int = 1, resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to target size `(size["height"], size["width"])`. If `keep_aspect_ratio` is `True`, the image @@ -199,7 +199,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -220,7 +220,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 87f8d026891df3..187a6c36656a8e 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -379,7 +379,6 @@ def __init__(self, config: DPTConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -438,7 +437,6 @@ def __init__(self, config: DPTConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/efficientformer/configuration_efficientformer.py b/src/transformers/models/efficientformer/configuration_efficientformer.py index 6a2bc31eedb28e..5f30664ff325a0 100644 --- a/src/transformers/models/efficientformer/configuration_efficientformer.py +++ b/src/transformers/models/efficientformer/configuration_efficientformer.py @@ -136,7 +136,7 @@ def __init__( hidden_act: str = "gelu", initializer_range: float = 0.02, layer_norm_eps: float = 1e-12, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) diff --git a/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py index 342f1263d3a811..6f7f1b60669f05 100644 --- a/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py @@ -22,11 +22,11 @@ import re from pathlib import Path +import requests import torch from PIL import Image from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor -import requests from transformers import ( EfficientFormerConfig, EfficientFormerForImageClassificationWithTeacher, diff --git a/src/transformers/models/efficientformer/image_processing_efficientformer.py b/src/transformers/models/efficientformer/image_processing_efficientformer.py index 963946f26688ef..5694fb166e3c76 100644 --- a/src/transformers/models/efficientformer/image_processing_efficientformer.py +++ b/src/transformers/models/efficientformer/image_processing_efficientformer.py @@ -96,7 +96,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 224, "width": 224} @@ -121,7 +121,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. @@ -158,7 +158,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. 
If the image is too small to be cropped to the size given, it will be padded (so the @@ -205,7 +205,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/electra/configuration_electra.py b/src/transformers/models/electra/configuration_electra.py index 01a6d1165a2e78..d8e1de0fc97fa4 100644 --- a/src/transformers/models/electra/configuration_electra.py +++ b/src/transformers/models/electra/configuration_electra.py @@ -155,7 +155,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 22114e28ef7360..8801b0de9a9de5 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -562,7 +562,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/electra/modeling_flax_electra.py b/src/transformers/models/electra/modeling_flax_electra.py index 093a6e69acf059..863ba86d778999 100644 --- a/src/transformers/models/electra/modeling_flax_electra.py +++ b/src/transformers/models/electra/modeling_flax_electra.py @@ -15,12 +15,11 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -692,7 +691,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -786,7 +785,6 @@ def __call__( return_dict: Optional[bool] = None, past_key_values: dict = None, ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py index f81bb8f3bf9da8..673c1db6119e4c 100644 --- a/src/transformers/models/electra/tokenization_electra.py +++ b/src/transformers/models/electra/tokenization_electra.py @@ -150,7 +150,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -198,7 +198,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git 
a/src/transformers/models/electra/tokenization_electra_fast.py b/src/transformers/models/electra/tokenization_electra_fast.py index 894f41df179225..cf92dd01714f9d 100644 --- a/src/transformers/models/electra/tokenization_electra_fast.py +++ b/src/transformers/models/electra/tokenization_electra_fast.py @@ -143,7 +143,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 9f34b8c67e42b9..b27b134ecf29dc 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -384,7 +384,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, - **kwargs + **kwargs, ) -> PreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py index 36df84f3055341..a500398d67a992 100644 --- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py @@ -317,7 +317,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): if input_shape is None: input_shape = ((1, 1), (1, 1)) @@ -582,7 +582,6 @@ def decode( def _decoder_forward( module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs ): - projection_module = module._get_projection_module() decoder_module = module._get_decoder_module() @@ -726,7 +725,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape @@ -763,7 +762,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, *model_args, - **kwargs + **kwargs, ) -> FlaxPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py index c6a8fb0f35c5c0..a9441c32290385 100644 --- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py @@ -157,7 +157,6 @@ def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): - if pad_token_id is None: raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.") pad_token_id = tf.cast(pad_token_id, input_ids.dtype) @@ -348,7 +347,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, - **kwargs + **kwargs, ) -> TFPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of 
the library from pretrained model @@ -573,7 +572,6 @@ def call( ) if encoder_outputs is None: - encoder_inputs = { "input_ids": input_ids, "attention_mask": attention_mask, diff --git a/src/transformers/models/ernie/configuration_ernie.py b/src/transformers/models/ernie/configuration_ernie.py index 71dfde96dafbac..91253ab1384bcc 100644 --- a/src/transformers/models/ernie/configuration_ernie.py +++ b/src/transformers/models/ernie/configuration_ernie.py @@ -131,7 +131,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 631dc9f3fadc2c..25e9e2e251d301 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -497,7 +497,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py index 6b961b788829b7..e51c5d01f1558c 100644 --- a/src/transformers/models/esm/configuration_esm.py +++ b/src/transformers/models/esm/configuration_esm.py @@ -120,7 +120,7 @@ def __init__( is_folding_model=False, esmfold_config=None, vocab_list=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs) diff --git a/src/transformers/models/esm/convert_esm.py b/src/transformers/models/esm/convert_esm.py index 996e9eaeed4fe0..6ac74d40e6f957 100644 --- a/src/transformers/models/esm/convert_esm.py +++ b/src/transformers/models/esm/convert_esm.py @@ -20,11 +20,11 @@ from pathlib import Path from tempfile import TemporaryDirectory -import torch - import esm as esm_module +import torch from esm.esmfold.v1.misc import batch_encode_sequences as esmfold_encode_sequences from esm.esmfold.v1.pretrained import esmfold_v1 + from transformers.models.esm.configuration_esm import EsmConfig, EsmFoldConfig from transformers.models.esm.modeling_esm import ( EsmForMaskedLM, diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index dc25903743bb4d..56544f4ca62bc4 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -295,7 +295,6 @@ def forward( past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor]: - mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys @@ -597,7 +596,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. 
Setting " diff --git a/src/transformers/models/esm/modeling_tf_esm.py b/src/transformers/models/esm/modeling_tf_esm.py index 27476c61b012cb..2bb25ba94d30c5 100644 --- a/src/transformers/models/esm/modeling_tf_esm.py +++ b/src/transformers/models/esm/modeling_tf_esm.py @@ -329,7 +329,6 @@ def call( output_attentions: Optional[bool] = False, training: bool = False, ) -> Tuple[tf.Tensor]: - mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys @@ -830,7 +829,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/esm/openfold_utils/chunk_utils.py b/src/transformers/models/esm/openfold_utils/chunk_utils.py index 4f68503e99bb68..4b60373438e2d7 100644 --- a/src/transformers/models/esm/openfold_utils/chunk_utils.py +++ b/src/transformers/models/esm/openfold_utils/chunk_utils.py @@ -62,6 +62,7 @@ def _get_minimal_slice_set( end is INCLUSIVE. """ + # start_edges and end_edges both indicate whether, starting from any given # dimension, the start/end index is at the top/bottom edge of the # corresponding tensor, modeled as a tree diff --git a/src/transformers/models/esm/openfold_utils/loss.py b/src/transformers/models/esm/openfold_utils/loss.py index e9523491d519d7..8c442786dc82ba 100644 --- a/src/transformers/models/esm/openfold_utils/loss.py +++ b/src/transformers/models/esm/openfold_utils/loss.py @@ -59,7 +59,7 @@ def compute_predicted_aligned_error( boundaries = torch.linspace(0, max_bin, steps=(no_bins - 1), device=logits.device) aligned_confidence_probs = torch.nn.functional.softmax(logits, dim=-1) - (predicted_aligned_error, max_predicted_aligned_error,) = _calculate_expected_aligned_error( + predicted_aligned_error, max_predicted_aligned_error = _calculate_expected_aligned_error( alignment_confidence_breaks=boundaries, aligned_distance_error_probs=aligned_confidence_probs, ) diff --git a/src/transformers/models/esm/openfold_utils/residue_constants.py b/src/transformers/models/esm/openfold_utils/residue_constants.py index 6cab95652c63b3..8f0ad3b50c6505 100644 --- a/src/transformers/models/esm/openfold_utils/residue_constants.py +++ b/src/transformers/models/esm/openfold_utils/residue_constants.py @@ -399,11 +399,13 @@ def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> lis @functools.lru_cache(maxsize=None) -def load_stereo_chemical_props() -> Tuple[ - Mapping[str, List[Bond]], - Mapping[str, List[Bond]], - Mapping[str, List[BondAngle]], -]: +def load_stereo_chemical_props() -> ( + Tuple[ + Mapping[str, List[Bond]], + Mapping[str, List[Bond]], + Mapping[str, List[BondAngle]], + ] +): """Load stereo_chemical_props.txt into a nice structure. 
Load literature values for bond lengths and bond angles and translate bond angles into the length of the opposite diff --git a/src/transformers/models/flaubert/configuration_flaubert.py b/src/transformers/models/flaubert/configuration_flaubert.py index 7d9c60338516e3..ba6d79891fa90d 100644 --- a/src/transformers/models/flaubert/configuration_flaubert.py +++ b/src/transformers/models/flaubert/configuration_flaubert.py @@ -180,7 +180,7 @@ def __init__( lang_id=0, pad_token_id=2, bos_token_id=0, - **kwargs + **kwargs, ): """Constructs FlaubertConfig.""" self.pre_norm = pre_norm diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index 8e48a21285eac1..bb7790730115c3 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -98,7 +98,6 @@ def get_masks(slen, lengths, causal, padding_mask=None): # Copied from transformers.models.xlm.modeling_xlm.MultiHeadAttention class MultiHeadAttention(nn.Module): - NEW_ID = itertools.count() def __init__(self, n_heads, dim, config): diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py index d52f41a57d65c5..919cd6cc1e4227 100644 --- a/src/transformers/models/flaubert/modeling_tf_flaubert.py +++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py @@ -826,7 +826,6 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = False, ) -> Union[Tuple, TFFlaubertWithLMHeadModelOutput]: - transformer_outputs = self.transformer( input_ids=input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py index 5a0ac59c39dfd0..26f68e75d70579 100644 --- a/src/transformers/models/flaubert/tokenization_flaubert.py +++ b/src/transformers/models/flaubert/tokenization_flaubert.py @@ -245,9 +245,8 @@ def __init__( ], lang2id=None, id2lang=None, - **kwargs + **kwargs, ): - do_lowercase_and_remove_accent = kwargs.pop("do_lowercase_and_remove_accent", None) if do_lowercase_and_remove_accent is not None: logger.warning( diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index e74101203c501a..da2c9cc95cfbb3 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -109,7 +109,7 @@ def __init__( qkv_bias: bool = True, mask_token: bool = True, vocab_size: int = 8192, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -131,7 +131,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the image config dict if we are loading from FlavaConfig @@ -237,7 +236,7 @@ def __init__( layer_norm_eps: float = 1e-12, pad_token_id: int = 0, qkv_bias: bool = True, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -259,7 +258,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from FlavaConfig @@ -343,7 +341,7 @@ def __init__( layer_norm_eps: float = 1e-12, qkv_bias: bool = True, use_cls_token: bool = True, - 
**kwargs + **kwargs, ): super().__init__(**kwargs) @@ -444,7 +442,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the image codebook config dict if we are loading from FlavaConfig @@ -555,7 +552,7 @@ def __init__( global_backprop_contrastive: bool = True, skip_unmasked_multimodal_encoder: bool = True, return_loss: bool = True, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -621,7 +618,7 @@ def from_configs( text_config: FlavaTextConfig, multimodal_config: FlavaMultimodalConfig, image_codebook_config: FlavaImageCodebookConfig, - **kwargs + **kwargs, ): r""" Instantiate a [`FlavaConfig`] (or a derived class) from flava text model configuration, flava image model diff --git a/src/transformers/models/flava/image_processing_flava.py b/src/transformers/models/flava/image_processing_flava.py index c41bb37e72377c..9f54306112fe3a 100644 --- a/src/transformers/models/flava/image_processing_flava.py +++ b/src/transformers/models/flava/image_processing_flava.py @@ -254,7 +254,7 @@ def __init__( codebook_do_normalize: bool = True, codebook_image_mean: Optional[Union[float, Iterable[float]]] = None, codebook_image_std: Optional[Union[float, Iterable[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 224, "width": 224} @@ -338,7 +338,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. @@ -365,7 +365,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along @@ -389,7 +389,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -410,7 +410,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
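Two further mechanical rewrites appear a little earlier in this section, in the ESM/openfold hunks: the redundant parentheses around a multi-target assignment are dropped (`(a, b,) = f()` becomes `a, b = f()`), and the long return annotation of `load_stereo_chemical_props` is wrapped in parentheses as a whole instead of being split inside the subscript. A compact sketch of both, with made-up helper names standing in for the real openfold functions:

```python
from typing import List, Mapping, Tuple


def _calculate_expected_error() -> Tuple[float, float]:
    """Toy stand-in for the helper unpacked in the openfold loss hunk."""
    return 0.25, 1.0


# Old style: parenthesized targets with a trailing comma,
#   (predicted_error, max_predicted_error,) = _calculate_expected_error()
# New style: plain multiple assignment, no parentheses.
predicted_error, max_predicted_error = _calculate_expected_error()


# A long return annotation is now wrapped in parentheses as one unit rather
# than breaking inside the subscript; the annotation itself is unchanged.
def load_props() -> (
    Tuple[
        Mapping[str, List[float]],
        Mapping[str, List[float]],
    ]
):
    return {}, {}


if __name__ == "__main__":
    print(predicted_error, max_predicted_error, load_props())
```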
diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 52f34eca18447a..720e0659725d54 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -509,7 +509,6 @@ def __init__(self, config: FlavaPossibleConfigs) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -569,7 +568,6 @@ def __init__(self, config: FlavaPossibleConfigs) -> None: # Copied from transformers.models.vit.modeling_vit.ViTIntermediate.forward def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 20562c84569ea1..e13716330fccca 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -78,7 +78,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ): """ This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and diff --git a/src/transformers/models/fnet/configuration_fnet.py b/src/transformers/models/fnet/configuration_fnet.py index 29dc4c0f9126fe..9efa06487756dd 100644 --- a/src/transformers/models/fnet/configuration_fnet.py +++ b/src/transformers/models/fnet/configuration_fnet.py @@ -103,7 +103,7 @@ def __init__( pad_token_id=3, bos_token_id=1, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py b/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py index 27b6563e5dd970..f77a44874ae429 100644 --- a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py +++ b/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py @@ -18,8 +18,8 @@ import argparse import torch - from flax.training.checkpoints import restore_checkpoint + from transformers import FNetConfig, FNetForPreTraining from transformers.utils import logging diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index 75684025443b04..ebc58167b51725 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -185,7 +185,6 @@ def _init_fourier_transform(self, config): self.fourier_transform = fftn def forward(self, hidden_states): - # NOTE: We do not use torch.vmap as it is not integrated into PyTorch stable versions. # Interested users can modify the code to use vmap from the nightly versions, getting the vmap from here: # https://pytorch.org/docs/master/generated/torch.vmap.html. 
Note that fourier transform methods will need diff --git a/src/transformers/models/fnet/tokenization_fnet.py b/src/transformers/models/fnet/tokenization_fnet.py index e7e3adfd793a50..6edcec45b5b03a 100644 --- a/src/transformers/models/fnet/tokenization_fnet.py +++ b/src/transformers/models/fnet/tokenization_fnet.py @@ -113,7 +113,7 @@ def __init__( cls_token="[CLS]", mask_token="[MASK]", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it and # is included in the raw text, there should be a match in a non-normalized sentence. @@ -238,7 +238,7 @@ def _decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, spaces_between_special_tokens: bool = True, - **kwargs + **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) diff --git a/src/transformers/models/fnet/tokenization_fnet_fast.py b/src/transformers/models/fnet/tokenization_fnet_fast.py index 7cbd339c8b58b2..e71dbb8977acc2 100644 --- a/src/transformers/models/fnet/tokenization_fnet_fast.py +++ b/src/transformers/models/fnet/tokenization_fnet_fast.py @@ -104,7 +104,7 @@ def __init__( pad_token="", cls_token="[CLS]", mask_token="[MASK]", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it and # is included in the raw text, there should be a match in a non-normalized sentence. diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py index 85775d07b2e315..decfb1b90f9cba 100644 --- a/src/transformers/models/fsmt/configuration_fsmt.py +++ b/src/transformers/models/fsmt/configuration_fsmt.py @@ -171,7 +171,7 @@ def __init__( bos_token_id=0, eos_token_id=2, forced_eos_token_id=2, - **common_kwargs + **common_kwargs, ): self.langs = langs self.src_vocab_size = src_vocab_size diff --git a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py index 85f5290a9ebd21..ef2764f0ed10ba 100755 --- a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py @@ -88,7 +88,6 @@ def rewrite_dict_keys(d): def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path): - # prep assert os.path.exists(fsmt_checkpoint_path) os.makedirs(pytorch_dump_folder_path, exist_ok=True) diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index ac22dea6fcc079..408967bd0590e3 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -1273,7 +1273,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): return { "input_ids": None, # encoder_outputs is defined. 
input_ids not needed diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index 9b9dc0bc956ccd..1c401c1faa9ee3 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -195,7 +195,7 @@ def __init__( bos_token="", sep_token="", pad_token="", - **kwargs + **kwargs, ): super().__init__( langs=langs, diff --git a/src/transformers/models/funnel/configuration_funnel.py b/src/transformers/models/funnel/configuration_funnel.py index ddaa7d9b5ed493..8d87ae23a77045 100644 --- a/src/transformers/models/funnel/configuration_funnel.py +++ b/src/transformers/models/funnel/configuration_funnel.py @@ -124,7 +124,7 @@ def __init__( separate_cls=True, truncate_seq=True, pool_q_only=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.block_sizes = block_sizes diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index e2def568d1cdba..2b109cdbab8a2f 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -771,7 +771,6 @@ def call( return_dict=None, training=False, ): - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -1177,7 +1176,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[Tuple[tf.Tensor], TFBaseModelOutput]: - return self.funnel( input_ids=input_ids, attention_mask=attention_mask, @@ -1225,7 +1223,7 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs + **kwargs, ) -> Union[Tuple[tf.Tensor], TFFunnelForPreTrainingOutput]: r""" Returns: diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py index 476fec51a8a45b..245694bfac52ef 100644 --- a/src/transformers/models/funnel/tokenization_funnel.py +++ b/src/transformers/models/funnel/tokenization_funnel.py @@ -155,7 +155,7 @@ def __init__( eos_token="", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -209,7 +209,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/funnel/tokenization_funnel_fast.py b/src/transformers/models/funnel/tokenization_funnel_fast.py index 60be9fbcd76956..864303eb210153 100644 --- a/src/transformers/models/funnel/tokenization_funnel_fast.py +++ b/src/transformers/models/funnel/tokenization_funnel_fast.py @@ -158,7 +158,7 @@ def __init__( tokenize_chinese_chars=True, strip_accents=None, wordpieces_prefix="##", - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index fc933f32591c52..7840ce63de65b3 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -91,7 +91,7 @@ def __init__( layer_norm_eps=1e-5, attention_dropout=0.0, initializer_range=0.02, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -109,7 +109,6 @@ def __init__( 
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        # get the vision config dict if we are loading from GITConfig
@@ -211,7 +210,7 @@ def __init__(
        bos_token_id=101,
        eos_token_id=102,
        num_image_with_embedding=None,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
diff --git a/src/transformers/models/git/convert_git_to_pytorch.py b/src/transformers/models/git/convert_git_to_pytorch.py
index e4ebf645ae7afb..e089ec89854f96 100644
--- a/src/transformers/models/git/convert_git_to_pytorch.py
+++ b/src/transformers/models/git/convert_git_to_pytorch.py
@@ -21,12 +21,12 @@
from pathlib import Path
import numpy as np
+import requests
import torch
+from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
-import requests
-from huggingface_hub import hf_hub_download
from transformers import (
    AutoTokenizer,
    CLIPImageProcessor,
diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py
index b8c95934f8e7e8..96fa94d9a7df9e 100644
--- a/src/transformers/models/git/modeling_git.py
+++ b/src/transformers/models/git/modeling_git.py
@@ -443,7 +443,6 @@ def forward(
            past_key_value = past_key_values[i] if past_key_values is not None else None
            if self.gradient_checkpointing and self.training:
-
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/glpn/configuration_glpn.py b/src/transformers/models/glpn/configuration_glpn.py
index aec3d1c4679527..9951d1615cc622 100644
--- a/src/transformers/models/glpn/configuration_glpn.py
+++ b/src/transformers/models/glpn/configuration_glpn.py
@@ -112,7 +112,7 @@ def __init__(
        decoder_hidden_size=64,
        max_depth=10,
        head_in_index=-1,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(**kwargs)
diff --git a/src/transformers/models/glpn/convert_glpn_to_pytorch.py b/src/transformers/models/glpn/convert_glpn_to_pytorch.py
index d083ff8271d802..fa0af4691a1439 100644
--- a/src/transformers/models/glpn/convert_glpn_to_pytorch.py
+++ b/src/transformers/models/glpn/convert_glpn_to_pytorch.py
@@ -19,10 +19,10 @@
from collections import OrderedDict
from pathlib import Path
+import requests
import torch
from PIL import Image
-import requests
from transformers import GLPNConfig, GLPNFeatureExtractor, GLPNForDepthEstimation
from transformers.utils import logging
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
index 0533d4c24271d0..ba279c3c4ff48c 100644
--- a/src/transformers/models/glpn/image_processing_glpn.py
+++ b/src/transformers/models/glpn/image_processing_glpn.py
@@ -57,7 +57,7 @@ def __init__(
        size_divisor: int = 32,
        resample=PILImageResampling.BILINEAR,
        do_rescale: bool = True,
-        **kwargs
+        **kwargs,
    ) -> None:
        self.do_resize = do_resize
        self.do_rescale = do_rescale
diff --git a/src/transformers/models/gpt2/modeling_flax_gpt2.py b/src/transformers/models/gpt2/modeling_flax_gpt2.py
index 8e375b6dbe6035..2a449360b8893d 100644
--- a/src/transformers/models/gpt2/modeling_flax_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_flax_gpt2.py
@@ -199,7 +199,6 @@ def __call__(
        init_cache: bool = False,
        output_attentions: bool = False,
    ):
-
        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index 1a7ba62c4146a2..9197a1f56dd53b 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -844,7 +844,6 @@ def forward(
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-
            # Model parallel
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
@@ -860,7 +859,6 @@ def forward(
                all_hidden_states = all_hidden_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
-
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py
index c08e864b8d37f9..62df70cce4b93f 100644
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -163,7 +163,6 @@ def call(
        output_attentions,
        training=False,
    ):
-
        if encoder_hidden_states is not None:
            if not hasattr(self, "q_attn"):
                raise ValueError(
@@ -230,7 +229,6 @@ def __init__(self, config, scale=False, **kwargs):
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
        if config.add_cross_attention:
-
            self.crossattention = TFAttention(nx, config, scale, name="crossattention", is_cross_attention=True)
            self.ln_cross_attn = tf.keras.layers.LayerNormalization(
                epsilon=config.layer_norm_epsilon, name="ln_cross_attn"
@@ -365,7 +363,6 @@ def call(
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
-
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py
index 1be35fbfdf4919..c462e45d01e3df 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2.py
@@ -165,7 +165,7 @@ def __init__(
        pad_token=None,
        add_prefix_space=False,
        add_bos_token=False,
-        **kwargs
+        **kwargs,
    ):
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
index eefd35aa94a7a9..7d7500ee9cca3b 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -133,7 +133,7 @@ def __init__(
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(
            vocab_file,
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_tf.py b/src/transformers/models/gpt2/tokenization_gpt2_tf.py
index ba6f754373c5c7..4ab4af5b9d66f3 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2_tf.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_tf.py
@@ -2,7 +2,6 @@
from typing import Dict, List, Union
import tensorflow as tf
-
from keras_nlp.tokenizers import BytePairTokenizer
from tensorflow_text import pad_model_inputs
diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
index be48e95d2ccd26..f7d59cfca3c1c4 100644
--- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
@@ -116,7 +116,7 @@ def __init__(
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
-        **kwargs
+        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
@@ -221,7 +221,6 @@ def generate_dummy_inputs(
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
-
        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
        )
diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index 7a5a913292e32b..68084f3b7cbbe9 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -219,7 +219,6 @@ def forward(
        use_cache=False,
        output_attentions=False,
    ):
-
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)
@@ -596,7 +595,6 @@ def forward(
                all_hidden_states = all_hidden_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
-
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
index a5ba1fddd93a21..7d2d87e65f710a 100644
--- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
@@ -103,7 +103,7 @@ def __init__(
        eos_token_id=2,
        tie_word_embeddings=False,
        use_parallel_residual=True,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
        self.vocab_size = vocab_size
diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
index 589eaae7804918..7fed1ad556249f 100755
--- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
@@ -315,7 +315,6 @@ def forward(
        layer_past=None,
        output_attentions=False,
    ):
-
        attention_layer_outputs = self.attention(
            self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
@@ -522,7 +521,6 @@ def forward(
                all_hidden_states = all_hidden_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
-
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
@@ -577,7 +575,6 @@ def custom_forward(*inputs):
    """GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING
)
class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
-
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
    def __init__(self, config):
diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
index c08d533835d708..1d4c1cec3a754f 100644
--- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
+++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
@@ -106,7 +106,7 @@ def __init__(
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(
            vocab_file,
diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
index d0df7ef44d245e..8d8519b9eae8bf 100644
--- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
@@ -101,7 +101,7 @@ def __init__(
        eos_token_id=31999,
        attention_dropout=0.1,
        hidden_dropout=0.0,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
        self.vocab_size = vocab_size
diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
index 76df8beee9cc71..8badb60f97b550 100755
--- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
@@ -590,7 +590,6 @@ def forward(
    GPT_NEOX_JAPANESE_START_DOCSTRING,
)
class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel):
-
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "embed_out.weight"]
    def __init__(self, config):
diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
index a132d999a31370..c9f4f677cb483e 100644
--- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
@@ -129,7 +129,7 @@ def __init__(
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        do_clean_text=False,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(
            unk_token=unk_token,
diff --git a/src/transformers/models/gpt_sw3/__init__.py b/src/transformers/models/gpt_sw3/__init__.py
index c9e6dca3ef6efa..3d5245364a3aed 100644
--- a/src/transformers/models/gpt_sw3/__init__.py
+++ b/src/transformers/models/gpt_sw3/__init__.py
@@ -33,7 +33,6 @@
if TYPE_CHECKING:
-
    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
index bb2025381c72f2..f982c5b6b17164 100644
--- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
+++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
@@ -115,9 +115,8 @@ def __init__(
        eos_token=None,
        bos_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs
+        **kwargs,
    ) -> None:
-
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        name_or_path = kwargs.get("name_or_path")
diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py
index dd0976cb293183..b40861c354be76 100644
--- a/src/transformers/models/gptj/configuration_gptj.py
+++ b/src/transformers/models/gptj/configuration_gptj.py
@@ -112,7 +112,7 @@ def __init__(
        bos_token_id=50256,
        eos_token_id=50256,
        tie_word_embeddings=False,
-        **kwargs
+        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
diff --git a/src/transformers/models/gptj/modeling_flax_gptj.py b/src/transformers/models/gptj/modeling_flax_gptj.py
index 1f00893b5c873b..6270355129ff27 100644
--- a/src/transformers/models/gptj/modeling_flax_gptj.py
+++ b/src/transformers/models/gptj/modeling_flax_gptj.py
@@ -16,11 +16,10 @@
from functools import partial
from typing import Optional, Tuple
-import numpy as np
-
import flax.linen as nn
import jax
import jax.numpy as jnp
+import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
@@ -212,7 +211,6 @@ def __call__(
        init_cache: bool = False,
        output_attentions: bool = False,
    ):
-
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index 84282fb0736781..3459a93b5dd43a 100755
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -152,7 +152,6 @@ def _attn(
        attention_mask=None,
        head_mask=None,
    ):
-
        # compute causal mask from causal mask buffer
        query_length, key_length = query.size(-2), key.size(-2)
        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
@@ -199,7 +198,6 @@ def forward(
        Tuple[torch.Tensor, Tuple[torch.Tensor]],
        Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
    ]:
-
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)
@@ -627,7 +625,6 @@ def forward(
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-
            # Model parallel
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
@@ -643,7 +640,6 @@ def forward(
                all_hidden_states = all_hidden_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
-
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py index 244d4d3ba951b8..f077a52a03ae02 100644 --- a/src/transformers/models/gptj/modeling_tf_gptj.py +++ b/src/transformers/models/gptj/modeling_tf_gptj.py @@ -386,7 +386,6 @@ def call( return_dict=None, training=False, ) -> Union[TFBaseModelOutputWithPast, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/graphormer/algos_graphormer.pyx b/src/transformers/models/graphormer/algos_graphormer.pyx index 981a2b6299ea85..a0fafbdee53b55 100644 --- a/src/transformers/models/graphormer/algos_graphormer.pyx +++ b/src/transformers/models/graphormer/algos_graphormer.pyx @@ -4,7 +4,6 @@ import cython cimport numpy - from cython.parallel cimport parallel, prange import numpy as np diff --git a/src/transformers/models/graphormer/modeling_graphormer.py b/src/transformers/models/graphormer/modeling_graphormer.py index 82b8b9f87620db..c5d293a6135077 100755 --- a/src/transformers/models/graphormer/modeling_graphormer.py +++ b/src/transformers/models/graphormer/modeling_graphormer.py @@ -799,7 +799,7 @@ def forward( perturb=None, masked_tokens=None, return_dict: Optional[bool] = None, - **unused + **unused, ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index 98526caa635fe9..ded75904c55532 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -108,7 +108,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -127,7 +127,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from GroupViTConfig @@ -221,7 +220,7 @@ def __init__( initializer_factor=1.0, assign_eps=1.0, assign_mlp_ratio=[0.5, 4], - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -251,7 +250,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from GroupViTConfig @@ -303,7 +301,7 @@ def __init__( projection_dim=256, projection_intermediate_dim=4096, logit_scale_init_value=2.6592, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -392,7 +390,6 @@ def generate_dummy_inputs( seq_length: int = -1, framework: Optional["TensorType"] = None, ) -> Mapping[str, Any]: - text_input_dict = super().generate_dummy_inputs( processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework ) diff --git a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py b/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py index e83bdd35cb37cb..059f10f6129bee 100644 --- a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py +++ 
b/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py @@ -21,10 +21,10 @@ import argparse +import requests import torch from PIL import Image -import requests from transformers import CLIPProcessor, GroupViTConfig, GroupViTModel diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 8a5f247e6537fa..023f9370aa2fda 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -195,7 +195,6 @@ def __init__(self, config: GroupViTVisionConfig): self.assign_eps = config.assign_eps def get_attn(self, attn, gumbel=True, hard=True): - if gumbel and self.training: attn = gumbel_softmax(attn, dim=-2, hard=hard) else: @@ -931,7 +930,6 @@ def forward( output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, BaseModelOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py index 006bd868c9b7f1..3826b83e7a4637 100644 --- a/src/transformers/models/groupvit/modeling_tf_groupvit.py +++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py @@ -291,7 +291,6 @@ def __init__(self, config: GroupViTVisionConfig, **kwargs): self.assign_eps = config.assign_eps def get_attn(self, attn: tf.Tensor, gumbel: bool = True, hard: bool = True, training: bool = False) -> tf.Tensor: - if gumbel and training: attn = gumbel_softmax(attn, dim=-2, hard=hard) else: @@ -474,7 +473,6 @@ def __init__(self, config: GroupViTVisionConfig, **kwargs): self.config = config def build(self, input_shape: tf.TensorShape): - num_patches = self.patch_embeddings.num_patches self.position_embeddings = self.add_weight( shape=(1, num_patches, self.config.hidden_size), @@ -540,7 +538,6 @@ def __init__(self, config: GroupViTTextConfig, **kwargs): self.config = config def build(self, input_shape: tf.TensorShape): - with tf.name_scope("token_embedding"): self.weight = self.add_weight( shape=(self.config.vocab_size, self.embed_dim), @@ -1104,7 +1101,6 @@ def call( return_dict: bool, training: bool = False, ) -> Union[Tuple, TFBaseModelOutputWithPooling]: - embedding_output = self.embeddings(pixel_values) encoder_outputs = self.encoder( @@ -1202,7 +1198,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1264,7 +1259,6 @@ def __init__(self, config: GroupViTConfig, **kwargs): ] def build(self, input_shape: tf.TensorShape): - self.logit_scale = self.add_weight( shape=(1,), initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), @@ -1285,7 +1279,6 @@ def get_text_features( return_dict: Optional[bool] = None, training: bool = False, ) -> tf.Tensor: - if input_ids is None: raise ValueError("You have to specify either input_ids") @@ -1320,7 +1313,6 @@ def get_image_features( return_dict: Optional[bool] = None, training: bool = False, ) -> tf.Tensor: - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1353,7 +1345,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFGroupViTModelOutput, Tuple[tf.Tensor]]: - if 
input_ids is None: raise ValueError("You have to specify either input_ids") if pixel_values is None: diff --git a/src/transformers/models/herbert/tokenization_herbert.py b/src/transformers/models/herbert/tokenization_herbert.py index 479446787a3fe2..80c6cb6d63e133 100644 --- a/src/transformers/models/herbert/tokenization_herbert.py +++ b/src/transformers/models/herbert/tokenization_herbert.py @@ -320,9 +320,8 @@ def __init__( ], lang2id=None, id2lang=None, - **kwargs + **kwargs, ): - super().__init__( unk_token=unk_token, bos_token=bos_token, @@ -486,7 +485,6 @@ def bpe(self, token): return word def _tokenize(self, text): - pre_tokens = self.bert_pre_tokenizer.tokenize(text) split_tokens = [] diff --git a/src/transformers/models/herbert/tokenization_herbert_fast.py b/src/transformers/models/herbert/tokenization_herbert_fast.py index 234ad4a5679190..67e38c1c5ee7bd 100644 --- a/src/transformers/models/herbert/tokenization_herbert_fast.py +++ b/src/transformers/models/herbert/tokenization_herbert_fast.py @@ -72,9 +72,8 @@ def __init__( pad_token="", mask_token="", sep_token="", - **kwargs + **kwargs, ): - super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index be2e6bbf4c71ab..139df45bbb791d 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -196,7 +196,7 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py index d7ba74fedae7b2..571761e022846f 100644 --- a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py +++ b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py @@ -18,8 +18,8 @@ import argparse import torch - from s3prl.hub import distilhubert + from transformers import HubertConfig, HubertModel, Wav2Vec2FeatureExtractor, logging diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index a96ef5cf5db44a..b1490557820541 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1180,7 +1180,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index df34adc66dbec0..f4722532e8a3e3 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -318,7 +318,6 @@ def __init__( self._check_axis() def build(self, input_shape): - self._check_if_input_shape_is_none(input_shape) self._set_number_of_groups_for_instance_norm(input_shape) self._check_size_of_dimensions(input_shape) @@ -330,7 +329,6 @@ def build(self, input_shape): super().build(input_shape) def call(self, inputs): - input_shape = tf.keras.backend.int_shape(inputs) tensor_input_shape = tf.shape(inputs) @@ -367,7 +365,6 @@ def compute_output_shape(self, input_shape): 
return input_shape def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): - group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] is_instance_norm = (input_shape[self.axis] // self.groups) == 1 if not is_instance_norm: @@ -380,7 +377,6 @@ def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): return inputs, group_shape def _apply_normalization(self, reshaped_inputs, input_shape): - group_shape = tf.keras.backend.int_shape(reshaped_inputs) group_reduction_axes = list(range(1, len(group_shape))) is_instance_norm = (input_shape[self.axis] // self.groups) == 1 @@ -432,7 +428,6 @@ def _set_number_of_groups_for_instance_norm(self, input_shape): self.groups = dim def _check_size_of_dimensions(self, input_shape): - dim = input_shape[self.axis] if dim < self.groups: raise ValueError( @@ -453,19 +448,16 @@ def _check_size_of_dimensions(self, input_shape): ) def _check_axis(self): - if self.axis == 0: raise ValueError( "You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead" ) def _create_input_spec(self, input_shape): - dim = input_shape[self.axis] self.input_spec = tf.keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim}) def _add_gamma_weight(self, input_shape): - dim = input_shape[self.axis] shape = (dim,) @@ -481,7 +473,6 @@ def _add_gamma_weight(self, input_shape): self.gamma = None def _add_beta_weight(self, input_shape): - dim = input_shape[self.axis] shape = (dim,) @@ -1647,7 +1638,6 @@ def call( logits = self.lm_head(hidden_states) if labels is not None: - if tf.reduce_max(labels) >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/ibert/configuration_ibert.py b/src/transformers/models/ibert/configuration_ibert.py index 32d4d2e56a809e..fe46e3fca61539 100644 --- a/src/transformers/models/ibert/configuration_ibert.py +++ b/src/transformers/models/ibert/configuration_ibert.py @@ -111,7 +111,7 @@ def __init__( position_embedding_type="absolute", quant_mode=False, force_dequant="none", - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/ibert/quant_modules.py b/src/transformers/models/ibert/quant_modules.py index fa657924645e93..8e2f123c578c0b 100644 --- a/src/transformers/models/ibert/quant_modules.py +++ b/src/transformers/models/ibert/quant_modules.py @@ -163,7 +163,6 @@ def forward( specified_min=None, specified_max=None, ): - x_act = x if identity is None else identity + x # collect running stats if training if self.training: @@ -663,7 +662,6 @@ def forward(ctx, x, k, percentile_mode, scale): @staticmethod def backward(ctx, grad_output): - scale = ctx.scale if len(grad_output.shape) == 4: scale = scale.view(-1, 1, 1, 1) @@ -771,7 +769,6 @@ def forward( identity=None, identity_scaling_factor=None, ): - if len(pre_act_scaling_factor.shape) == 3: reshape = lambda x: x # noqa: E731 else: diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index af31fdf1918f33..451b4a265feb0a 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -91,7 +91,7 @@ def __init__( resample: PILImageResampling = PILImageResampling.BILINEAR, do_normalize: bool = True, do_color_quantize: bool = True, - **kwargs + **kwargs, ) 
-> None: super().__init__(**kwargs) size = size if size is not None else {"height": 256, "width": 256} @@ -109,7 +109,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index 4e52ef5c071d66..d7537c9dc35534 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -796,7 +796,6 @@ def forward( all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None all_hidden_states = () if output_hidden_states else None for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - # Model parallel if self.model_parallel: torch.cuda.set_device(hidden_states.device) @@ -812,7 +811,6 @@ def forward( all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py index 6ce345a8578e28..e705af931ec301 100644 --- a/src/transformers/models/jukebox/configuration_jukebox.py +++ b/src/transformers/models/jukebox/configuration_jukebox.py @@ -301,7 +301,7 @@ def __init__( spread=None, timing_dims=64, zero_out=False, - **kwargs + **kwargs, ): self.act_fn = act_fn self.alignment_head = alignment_head @@ -459,7 +459,7 @@ def __init__( sample_length=1058304, init_scale=0.2, zero_out=False, - **kwargs + **kwargs, ): self.hop_fraction = hop_fraction self.conv_input_shape = conv_input_shape @@ -486,7 +486,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from CLIPConfig @@ -576,7 +575,6 @@ def __init__( init_std=0.2, **kwargs, ): - if vqvae_config is None: vqvae_config = {} logger.info("vqvae_config is None. 
initializing the JukeboxVQVAE with default values.") diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py index c8d0831e53f3de..8625cbe868883b 100644 --- a/src/transformers/models/jukebox/convert_jukebox.py +++ b/src/transformers/models/jukebox/convert_jukebox.py @@ -19,9 +19,9 @@ import os from pathlib import Path +import requests import torch -import requests from transformers import JukeboxConfig, JukeboxModel from transformers.utils import logging @@ -116,7 +116,6 @@ def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping): re_prior_cond_proj_in = re.compile("conditioner_blocks.(\d*).cond.model.(\d*).(bias|weight)") for original_key, value in state_dict.items(): - # rename vqvae.encoder keys if re_encoder_block_conv_in.fullmatch(original_key): regex_match = re_encoder_block_conv_in.match(original_key) diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py index 38fed91e1b01bb..2528f1aa2272bf 100755 --- a/src/transformers/models/jukebox/modeling_jukebox.py +++ b/src/transformers/models/jukebox/modeling_jukebox.py @@ -1625,7 +1625,6 @@ class JukeboxMusicTokenConditioner(nn.Module): """ def __init__(self, config, level): - super().__init__() self.embed_tokens = nn.Embedding(config.music_vocab_size, config.hidden_size) config.embed_dim = config.music_vocab_size # setting correct argument for the `JukeboxDecoder` diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py index 01bada0e0806b4..85835c6cdf4201 100644 --- a/src/transformers/models/jukebox/tokenization_jukebox.py +++ b/src/transformers/models/jukebox/tokenization_jukebox.py @@ -23,8 +23,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np - import regex + from transformers.utils.generic import _is_jax, _is_numpy from ...tokenization_utils import AddedToken, PreTrainedTokenizer @@ -126,7 +126,7 @@ def __init__( max_n_lyric_tokens=512, n_genres=5, unk_token="<|endoftext|>", - **kwargs + **kwargs, ): unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token super().__init__( diff --git a/src/transformers/models/layoutlm/configuration_layoutlm.py b/src/transformers/models/layoutlm/configuration_layoutlm.py index c1e24acffa4562..f297adea1420c1 100644 --- a/src/transformers/models/layoutlm/configuration_layoutlm.py +++ b/src/transformers/models/layoutlm/configuration_layoutlm.py @@ -112,7 +112,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, max_2d_position_embeddings=1024, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) self.vocab_size = vocab_size diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 3b696d778adac3..d23c68840064d3 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -478,7 +478,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index 41267ab1f52939..2097ae58b8bf35 100644 --- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -717,7 +717,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py index bc7fbc9bb95baa..99f517c6a2a267 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py @@ -132,7 +132,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -180,7 +180,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py index 42d664c061c790..7ba06d7fa1107e 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py @@ -117,7 +117,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py index db14690387aa90..3cc8027c1dd5c8 100644 --- a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py @@ -145,7 +145,7 @@ def __init__( has_spatial_attention_bias=True, has_visual_segment_embedding=False, detectron2_config_args=None, - **kwargs + **kwargs, ): super().__init__( vocab_size=vocab_size, diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py index 04547eebd8ed95..ca01b3670d09c2 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py @@ -136,7 +136,7 @@ def __init__( apply_ocr: bool = True, ocr_lang: Optional[str] = None, tesseract_config: Optional[str] = "", - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 224, "width": 224} @@ -155,7 +155,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. 
diff --git a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py index 4ddd95bfbe5614..280b93043afeb1 100644 --- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py @@ -85,7 +85,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py index 306c4f34f1ff74..c6a8857325f4ee 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py @@ -242,7 +242,7 @@ def __init__( strip_accents=None, model_max_length: int = 512, additional_special_tokens: Optional[List[str]] = None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -304,7 +304,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) @@ -444,7 +443,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -463,6 +462,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -596,9 +596,8 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, @@ -655,9 +654,8 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " @@ -779,7 +777,7 @@ def encode( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> List[int]: encoded_inputs = self.encode_plus( text=text, @@ -826,7 +824,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. 
warning:: This method is deprecated, @@ -892,7 +890,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -946,7 +944,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens, @@ -1215,8 +1213,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py index a2059dbf743a34..bed4e133aa3c5c 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py @@ -136,7 +136,7 @@ def __init__( only_label_first_subword=True, tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, @@ -197,7 +197,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -216,6 +216,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -349,9 +350,8 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, @@ -413,7 +413,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. 
warning:: This method is deprecated, @@ -484,7 +484,6 @@ def _batch_encode_plus( return_length: bool = False, verbose: bool = True, ) -> BatchEncoding: - if not isinstance(batch_text_or_text_pairs, list): raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})") @@ -636,9 +635,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # make it a batched input # 2 options: # 1) only text, in case text must be a list of str diff --git a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py index 96876ced612e7f..e39e5b61e299bb 100644 --- a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py @@ -153,7 +153,7 @@ def __init__( num_channels=3, patch_size=16, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__( vocab_size=vocab_size, @@ -191,7 +191,6 @@ def __init__( class LayoutLMv3OnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.12") @property diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py index c2cd270846c902..aec81de1a99ec2 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py @@ -157,7 +157,7 @@ def __init__( apply_ocr: bool = True, ocr_lang: Optional[str] = None, tesseract_config: Optional[str] = "", - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 224, "width": 224} @@ -181,7 +181,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]) dimensions. @@ -207,7 +207,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Rescale an image by a scale factor. image = image * scale. @@ -228,7 +228,7 @@ def normalize( mean: Union[float, Iterable[float]], std: Union[float, Iterable[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. diff --git a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py index 9c6a8416d51f1f..4d406962378005 100644 --- a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py @@ -85,7 +85,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. 
In case diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py index 13a797be523ce4..b9c0ab127d42ca 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py @@ -291,7 +291,7 @@ def __init__( pad_token_box=[0, 0, 0, 0], pad_token_label=-100, only_label_first_subword=True, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -566,7 +566,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -585,6 +585,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -719,9 +720,8 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, @@ -779,9 +779,8 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " @@ -905,7 +904,7 @@ def encode( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> List[int]: encoded_inputs = self.encode_plus( text=text, @@ -953,7 +952,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated, @@ -1020,7 +1019,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -1074,7 +1073,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens, @@ -1346,8 +1345,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." 
) logger.error(error_msg) diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py index 98e2ee3e320f27..4bd3e91480c02b 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py @@ -156,7 +156,7 @@ def __init__( pad_token_box=[0, 0, 0, 0], pad_token_label=-100, only_label_first_subword=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, @@ -243,7 +243,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -262,6 +262,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -396,9 +397,8 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, @@ -462,7 +462,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated, @@ -534,7 +534,6 @@ def _batch_encode_plus( return_length: bool = False, verbose: bool = True, ) -> BatchEncoding: - if not isinstance(batch_text_or_text_pairs, list): raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})") @@ -687,9 +686,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # make it a batched input # 2 options: # 1) only text, in case text must be a list of str diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py index 49fbb1ac3ddc77..250c8c7fc6140c 100644 --- a/src/transformers/models/layoutxlm/processing_layoutxlm.py +++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py @@ -65,7 +65,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method first forwards the `images` argument to [`~LayoutLMv2FeatureExtractor.__call__`]. In case diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py index d825bd8f36248f..47c5315457b4fa 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py @@ -247,7 +247,7 @@ def __init__( pad_token_label=-100, only_label_first_subword=True, sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token @@ -462,7 +462,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -481,6 +481,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -613,9 +614,8 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " @@ -736,7 +736,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -790,7 +790,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens, diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py index 439d3994781c04..322239192740d0 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py @@ -233,7 +233,7 @@ def __init__( pad_token_box=[0, 0, 0, 0], pad_token_label=-100, only_label_first_subword=True, - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token @@ -287,7 +287,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -306,6 +306,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). 
""" + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -448,7 +449,6 @@ def _batch_encode_plus( verbose: bool = True, **kwargs, ) -> BatchEncoding: - if not isinstance(batch_text_or_text_pairs, list): raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})") @@ -600,9 +600,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # make it a batched input # 2 options: # 1) only text, in case text must be a list of str diff --git a/src/transformers/models/led/configuration_led.py b/src/transformers/models/led/configuration_led.py index 98d2e32f62e4e8..34c286ce18910f 100644 --- a/src/transformers/models/led/configuration_led.py +++ b/src/transformers/models/led/configuration_led.py @@ -132,7 +132,7 @@ def __init__( bos_token_id=0, eos_token_id=2, attention_window: Union[List[int], int] = 512, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_encoder_position_embeddings = max_encoder_position_embeddings diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 7d018a52d9e073..b60f088de808c1 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -2135,7 +2135,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index e45d94c742f8e9..75d4a15f194d11 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -1769,7 +1769,7 @@ def call( if global_attention_mask is not None: attention_mask = attention_mask * tf.cast((global_attention_mask + 1), dtype=attention_mask.dtype) - (padding_len, input_ids, attention_mask, inputs_embeds,) = self._pad_to_window_size( + padding_len, input_ids, attention_mask, inputs_embeds = self._pad_to_window_size( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, @@ -1809,7 +1809,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: hidden_states_to_add = self.compute_hidden_states(hidden_states, padding_len) encoder_states = encoder_states + (hidden_states_to_add,) @@ -2188,9 +2187,8 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): - if decoder_input_ids is None and decoder_inputs_embeds is None: use_cache = False @@ -2290,9 +2288,8 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): - outputs = self.led( input_ids=input_ids, attention_mask=attention_mask, @@ -2516,7 +2513,7 @@ def prepare_inputs_for_generation( decoder_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py index 812e374c7a485d..5b22701a226224 100644 --- a/src/transformers/models/led/tokenization_led.py +++ b/src/transformers/models/led/tokenization_led.py @@ -185,7 +185,7 @@ def __init__( pad_token="", mask_token="", 
add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py index ee9118f11d680f..153b32c2967bf4 100644 --- a/src/transformers/models/led/tokenization_led_fast.py +++ b/src/transformers/models/led/tokenization_led_fast.py @@ -148,7 +148,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/levit/configuration_levit.py b/src/transformers/models/levit/configuration_levit.py index 525221217ad450..06c7925a8f3797 100644 --- a/src/transformers/models/levit/configuration_levit.py +++ b/src/transformers/models/levit/configuration_levit.py @@ -105,7 +105,7 @@ def __init__( mlp_ratio=[2, 2, 2], attention_ratio=[2, 2, 2], initializer_range=0.02, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.image_size = image_size @@ -130,7 +130,6 @@ def __init__( # Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig class LevitOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py index de8826ce61d3e0..a6ae6603e5de9f 100644 --- a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py +++ b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py @@ -21,10 +21,10 @@ from functools import partial from pathlib import Path -import torch - import timm +import torch from huggingface_hub import hf_hub_download + from transformers import LevitConfig, LevitFeatureExtractor, LevitForImageClassificationWithTeacher from transformers.utils import logging diff --git a/src/transformers/models/levit/image_processing_levit.py b/src/transformers/models/levit/image_processing_levit.py index b369bfd33ad9f0..6aef221a16a8cd 100644 --- a/src/transformers/models/levit/image_processing_levit.py +++ b/src/transformers/models/levit/image_processing_levit.py @@ -99,7 +99,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, Iterable[float]]] = IMAGENET_DEFAULT_MEAN, image_std: Optional[Union[float, Iterable[float]]] = IMAGENET_DEFAULT_STD, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -124,7 +124,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -168,7 +168,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. @@ -191,7 +191,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Rescale an image by a scale factor. image = image * scale. @@ -212,7 +212,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. 
image = (image - image_mean) / image_std. diff --git a/src/transformers/models/lilt/configuration_lilt.py b/src/transformers/models/lilt/configuration_lilt.py index 16ec1d658f0ed0..d11899c94312ad 100644 --- a/src/transformers/models/lilt/configuration_lilt.py +++ b/src/transformers/models/lilt/configuration_lilt.py @@ -111,7 +111,7 @@ def __init__( classifier_dropout=None, channel_shrink_ratio=4, max_2d_position_embeddings=1024, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/lilt/modeling_lilt.py b/src/transformers/models/lilt/modeling_lilt.py index b372ddd9fca545..6a2b820b4ff8b7 100644 --- a/src/transformers/models/lilt/modeling_lilt.py +++ b/src/transformers/models/lilt/modeling_lilt.py @@ -239,7 +239,6 @@ def forward( head_mask=None, output_attentions=False, ): - layout_value_layer = self.transpose_for_scores(self.layout_value(layout_inputs), r=self.channel_shrink_ratio) layout_key_layer = self.transpose_for_scores(self.layout_key(layout_inputs), r=self.channel_shrink_ratio) layout_query_layer = self.transpose_for_scores(self.layout_query(layout_inputs), r=self.channel_shrink_ratio) diff --git a/src/transformers/models/longformer/configuration_longformer.py b/src/transformers/models/longformer/configuration_longformer.py index 1e8eea812db9cb..3f3e2da7e830e8 100644 --- a/src/transformers/models/longformer/configuration_longformer.py +++ b/src/transformers/models/longformer/configuration_longformer.py @@ -133,7 +133,7 @@ def __init__( layer_norm_eps: float = 1e-12, position_embedding_type: str = "absolute", onnx_export: bool = False, - **kwargs + **kwargs, ): """Constructs LongformerConfig.""" super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py b/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py index 4d9ebe017a1d86..ed7d32ab3edbef 100644 --- a/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py +++ b/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py @@ -39,7 +39,6 @@ def forward(self): def convert_longformer_qa_checkpoint_to_pytorch( longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str ): - # load longformer model from model identifier longformer = LongformerModel.from_pretrained(longformer_model) lightning_model = LightningModel(longformer) diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 44c8a5e31c7779..6ad6bdad6711a6 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1886,7 +1886,6 @@ def forward( LONGFORMER_START_DOCSTRING, ) class LongformerForSequenceClassification(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -2014,7 +2013,6 @@ def forward(self, hidden_states, **kwargs): LONGFORMER_START_DOCSTRING, ) class LongformerForQuestionAnswering(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -2154,7 +2152,6 @@ def forward( LONGFORMER_START_DOCSTRING, ) class LongformerForTokenClassification(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git 
a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index 10758ec7005e0c..e5e22a21276faf 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -1692,7 +1692,6 @@ def call( return_dict=None, training=False, ): - if input_ids is not None and not isinstance(input_ids, tf.Tensor): input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int64) elif input_ids is not None: @@ -2060,7 +2059,6 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = False, ) -> Union[TFLongformerBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - outputs = self.longformer( input_ids=input_ids, attention_mask=attention_mask, @@ -2393,7 +2391,6 @@ def call( labels: Optional[Union[np.ndarray, tf.Tensor]] = None, training: Optional[bool] = False, ) -> Union[TFLongformerSequenceClassifierOutput, Tuple[tf.Tensor]]: - if input_ids is not None and not isinstance(input_ids, tf.Tensor): input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int64) elif input_ids is not None: diff --git a/src/transformers/models/longformer/tokenization_longformer.py b/src/transformers/models/longformer/tokenization_longformer.py index 64bbeeb8ce51ae..5ff6f70afdeafb 100644 --- a/src/transformers/models/longformer/tokenization_longformer.py +++ b/src/transformers/models/longformer/tokenization_longformer.py @@ -207,7 +207,7 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py index 089ee69d668e9c..5d20caf8c2dfa0 100644 --- a/src/transformers/models/longformer/tokenization_longformer_fast.py +++ b/src/transformers/models/longformer/tokenization_longformer_fast.py @@ -188,7 +188,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/longt5/configuration_longt5.py b/src/transformers/models/longt5/configuration_longt5.py index 705fdc4939584b..0927d13034675b 100644 --- a/src/transformers/models/longt5/configuration_longt5.py +++ b/src/transformers/models/longt5/configuration_longt5.py @@ -108,9 +108,8 @@ def __init__( use_cache=True, pad_token_id=0, eos_token_id=1, - **kwargs + **kwargs, ): - self.vocab_size = vocab_size self.d_model = d_model self.d_kv = d_kv diff --git a/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py b/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py index 41cc3a2005dd94..5a1394c719d2d8 100644 --- a/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py +++ b/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py @@ -20,6 +20,7 @@ import argparse from t5x import checkpoints + from transformers import AutoConfig, FlaxAutoModelForSeq2SeqLM diff --git a/src/transformers/models/longt5/modeling_flax_longt5.py b/src/transformers/models/longt5/modeling_flax_longt5.py index de3e43f02cc5fd..458f6d597f30c8 100644 --- a/src/transformers/models/longt5/modeling_flax_longt5.py +++ b/src/transformers/models/longt5/modeling_flax_longt5.py @@ -18,11 +18,10 @@ import copy from typing import Any, 
Callable, List, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -1681,7 +1680,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -2392,7 +2391,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index 486eda387686af..1039b1cc5b0374 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -231,7 +231,6 @@ def __init__(self, hidden_size, eps=1e-6): self.variance_epsilon = eps def forward(self, hidden_states): - # LongT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for @@ -1181,7 +1180,6 @@ def forward( output_attentions=False, return_dict=True, ): - if past_key_value is not None: if not self.is_decoder: logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") @@ -1348,8 +1346,8 @@ def _shift_right(self, input_ids): pad_token_id = self.config.pad_token_id assert decoder_start_token_id is not None, ( - "self.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the" - " pad_token_id. See LongT5 docs for more information" + "self.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id." 
+ " See LongT5 docs for more information" ) # shift inputs to the right @@ -2096,9 +2094,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past is used if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/luke/configuration_luke.py b/src/transformers/models/luke/configuration_luke.py index 8f7438cc3c6a91..6e5c99900bbdf5 100644 --- a/src/transformers/models/luke/configuration_luke.py +++ b/src/transformers/models/luke/configuration_luke.py @@ -114,7 +114,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): """Constructs LukeConfig.""" super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index 25427f8a63a3cd..8df68e9dd1dceb 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -1022,7 +1022,6 @@ def _set_gradient_checkpointing(self, module, value=False): LUKE_START_DOCSTRING, ) class LukeModel(LukePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config: LukeConfig, add_pooling_layer: bool = True): diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index 9a5f6d42a665d2..ff177a44442c71 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -22,7 +22,6 @@ from typing import Dict, List, Optional, Tuple, Union import numpy as np - import regex as re from ...tokenization_utils import PreTrainedTokenizer @@ -313,9 +312,8 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token @@ -597,7 +595,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -742,9 +740,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. 
" @@ -824,7 +821,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -916,7 +913,6 @@ def _check_entity_input_format(self, entities: Optional[EntityInput], entity_spa ) if entities is not None: - if not isinstance(entities, list): raise ValueError("If you specify entities, they should be given as a list") @@ -934,7 +930,7 @@ def _create_input_sequence( entities_pair: Optional[EntityInput] = None, entity_spans: Optional[EntitySpanInput] = None, entity_spans_pair: Optional[EntitySpanInput] = None, - **kwargs + **kwargs, ) -> Tuple[list, list, list, list, list, list]: def get_input_ids(text): tokens = self.tokenize(text, **kwargs) @@ -975,7 +971,6 @@ def get_input_ids_and_entity_token_spans(text, entity_spans): first_entity_token_spans, second_entity_token_spans = None, None if self.task is None: - if entity_spans is None: first_ids = get_input_ids(text) else: @@ -1058,7 +1053,6 @@ def get_input_ids_and_entity_token_spans(text, entity_spans): first_ids = first_ids[:entity_token_start] + [special_token_id] + first_ids[entity_token_start:] elif self.task == "entity_span_classification": - if not (isinstance(entity_spans, list) and len(entity_spans) > 0 and isinstance(entity_spans[0], tuple)): raise ValueError( "Entity spans should be provided as a list of tuples, " @@ -1187,7 +1181,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids, diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index 572fa30d8dd172..9fe7ecb730fdac 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -526,7 +526,6 @@ def forward( visual_attention_mask, output_attentions=False, ): - lang_att_output, visual_att_output = self.cross_att( lang_input=lang_feats, lang_attention_mask=lang_attention_mask, @@ -609,7 +608,6 @@ def forward( visual_attention_mask=None, output_attentions=None, ): - vision_hidden_states = () language_hidden_states = () vision_attentions = () if output_attentions or self.config.output_attentions else None @@ -916,7 +914,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[LxmertModelOutput, Tuple[torch.FloatTensor]]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1123,7 +1120,6 @@ def _set_qa_logit_layer(self, qa_logit_layer): self.answer_head.logit_fc[-1] = qa_logit_layer def _get_resized_qa_labels(self, cur_qa_logit_layer, num_labels): - if num_labels is None: return cur_qa_logit_layer @@ -1355,7 +1351,6 @@ def _set_qa_logit_layer(self, qa_logit_layer): self.answer_head.logit_fc[-1] = qa_logit_layer def _get_resized_qa_labels(self, cur_qa_logit_layer, num_labels): - if num_labels is None: return cur_qa_logit_layer diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py index a657ddb94b5c23..84d05bebce2e85 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert.py +++ 
b/src/transformers/models/lxmert/tokenization_lxmert.py @@ -124,7 +124,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -172,7 +172,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/lxmert/tokenization_lxmert_fast.py b/src/transformers/models/lxmert/tokenization_lxmert_fast.py index 1b804f5239b202..8e58a3aafac5c1 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert_fast.py +++ b/src/transformers/models/lxmert/tokenization_lxmert_fast.py @@ -103,7 +103,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py index 0ab2365accd34a..453f8d45f3dca5 100644 --- a/src/transformers/models/m2m_100/configuration_m2m_100.py +++ b/src/transformers/models/m2m_100/configuration_m2m_100.py @@ -128,7 +128,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index c354b9503d418e..58f0ca289c467f 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -1054,7 +1054,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. 
Setting" @@ -1080,7 +1079,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=combined_attention_mask, diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index 0a6ad7ec3a15f4..a2fdd41d7442e0 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -133,7 +133,7 @@ def __init__( eos_token_id=0, forced_eos_token_id=0, share_encoder_decoder_embeddings=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.decoder_vocab_size = decoder_vocab_size or vocab_size diff --git a/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py b/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py index c175144623985f..f6b548c2b07f46 100644 --- a/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py +++ b/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py @@ -20,9 +20,9 @@ from pathlib import Path from typing import Tuple +import yaml from tqdm import tqdm -import yaml from transformers.models.marian.convert_marian_to_pytorch import ( FRONT_MATTER_TEMPLATE, convert, diff --git a/src/transformers/models/marian/convert_marian_to_pytorch.py b/src/transformers/models/marian/convert_marian_to_pytorch.py index 1fb5a34f064fd3..1662ffb358b44d 100644 --- a/src/transformers/models/marian/convert_marian_to_pytorch.py +++ b/src/transformers/models/marian/convert_marian_to_pytorch.py @@ -24,10 +24,10 @@ import numpy as np import torch +from huggingface_hub.hf_api import list_models from torch import nn from tqdm import tqdm -from huggingface_hub.hf_api import list_models from transformers import MarianConfig, MarianMTModel, MarianTokenizer @@ -631,7 +631,7 @@ def load_marian_model(self) -> MarianMTModel: model.model.decoder.embed_positions.weight = wpos_tensor if cfg.normalize_embedding: - if not ("encoder_emb_ln_scale_pre" in state_dict): + if "encoder_emb_ln_scale_pre" not in state_dict: raise ValueError("encoder_emb_ln_scale_pre is not in state dictionary") raise NotImplementedError("Need to convert layernorm_embedding") diff --git a/src/transformers/models/marian/modeling_flax_marian.py b/src/transformers/models/marian/modeling_flax_marian.py index db543ef8d9c940..96b26f8325ce8c 100644 --- a/src/transformers/models/marian/modeling_flax_marian.py +++ b/src/transformers/models/marian/modeling_flax_marian.py @@ -19,11 +19,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -883,7 +882,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1440,7 +1439,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/marian/modeling_marian.py 
b/src/transformers/models/marian/modeling_marian.py index 3937058a566016..d4dc4d53dd8485 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -1019,7 +1019,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1044,7 +1043,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index 17a4f0f5d1b587..eecaf95f6cb903 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -817,7 +817,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1133,9 +1132,8 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): - if decoder_input_ids is None and decoder_inputs_embeds is None: use_cache = False @@ -1239,7 +1237,7 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): outputs = self.model( input_ids=input_ids, @@ -1461,9 +1459,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index c688733321be0d..7d2af76fc331c4 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -138,7 +138,7 @@ def __init__( model_max_length=512, sp_model_kwargs: Optional[Dict[str, Any]] = None, separate_vocabs=False, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/markuplm/configuration_markuplm.py b/src/transformers/models/markuplm/configuration_markuplm.py index aa667fa53b8977..935625b1fa1fa8 100644 --- a/src/transformers/models/markuplm/configuration_markuplm.py +++ b/src/transformers/models/markuplm/configuration_markuplm.py @@ -126,7 +126,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index b9a3b70326eac5..86c67f7fe427f8 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -640,7 +640,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
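The Marian and MarkupLM hunks above repeat the same mechanical rewrites seen throughout this patch: a trailing comma is kept after `**kwargs` in multi-line signatures, the blank line that used to sit directly under a signature or block opener is removed, and `not (x in d)` is rewritten as `x not in d`. A minimal before/after sketch of that shape, on a hypothetical `prepare_batch` helper rather than repository code, might look like:

```python
# Illustrative sketch only: the recurring signature and condition rewrites
# applied across the files above, shown on a hypothetical helper
# (`prepare_batch` is not part of the patch).

def prepare_batch(
    input_ids,
    attention_mask=None,
    **kwargs,  # trailing comma now kept after **kwargs as well
):
    # before: a blank line sat here, immediately after the signature
    if attention_mask is None:
        attention_mask = [1] * len(input_ids)

    state = {"input_ids": input_ids, "attention_mask": attention_mask, **kwargs}
    # before: if not ("input_ids" in state): raise ...
    if "input_ids" not in state:
        raise ValueError("input_ids is not in state dictionary")
    return state


print(prepare_batch([101, 2023, 102]))
```

All three rewrites are purely stylistic; the functions behave exactly as before.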
diff --git a/src/transformers/models/markuplm/processing_markuplm.py b/src/transformers/models/markuplm/processing_markuplm.py index d6251586ac67ca..51307d20eb5f3b 100644 --- a/src/transformers/models/markuplm/processing_markuplm.py +++ b/src/transformers/models/markuplm/processing_markuplm.py @@ -66,7 +66,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method first forwards the `html_strings` argument to [`~MarkupLMFeatureExtractor.__call__`]. Next, it diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py index f7d0e445d0281e..f4eb0af4d9b756 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm.py +++ b/src/transformers/models/markuplm/tokenization_markuplm.py @@ -220,7 +220,7 @@ def __init__( pad_width=1001, pad_token_label=-100, only_label_first_subword=True, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -527,7 +527,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -676,7 +676,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( @@ -734,7 +734,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -857,7 +857,7 @@ def encode( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> List[int]: encoded_inputs = self.encode_plus( text=text, @@ -904,7 +904,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated, @@ -970,7 +970,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -1024,7 +1024,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens, @@ -1315,8 +1315,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." 
) logger.error(error_msg) diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py index 4a0a0b9e64a9d3..8c1cb73afdaac7 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py +++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py @@ -180,7 +180,7 @@ def __init__( pad_token_label=-100, only_label_first_subword=True, trim_offsets=False, - **kwargs + **kwargs, ): super().__init__( vocab_file=vocab_file, @@ -302,7 +302,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -321,6 +321,7 @@ def __call__( node_labels (`List[int]`, `List[List[int]]`, *optional*): Node-level integer labels (for token classification tasks). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -450,7 +451,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( @@ -513,7 +514,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated, @@ -736,7 +737,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # make it a batched input # 2 options: diff --git a/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py index b53e695db7ebc2..ea3e530ded5216 100644 --- a/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py @@ -20,16 +20,16 @@ from pprint import pformat from typing import Any, Dict, Iterator, List, Set, Tuple +import requests import torch import torchvision.transforms as T -from PIL import Image -from torch import Tensor, nn - -import requests from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg from detectron2.projects.deeplab import add_deeplab_config from huggingface_hub import hf_hub_download +from PIL import Image +from torch import Tensor, nn + from transformers import ( Mask2FormerConfig, Mask2FormerForUniversalSegmentation, @@ -624,7 +624,6 @@ def rename_keys_in_masked_attention_decoder(self, dst_state_dict: StateDict, src rename_keys = [] for i in range(self.config.decoder_layers - 1): - rename_keys.append( ( f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.out_proj.weight", diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 01c267cdd48c53..eb93391fb354f8 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -391,7 
+391,7 @@ def __init__( image_std: Union[float, List[float]] = None, ignore_index: Optional[int] = None, reduce_labels: bool = False, - **kwargs + **kwargs, ): if "size_divisibility" in kwargs: warnings.warn( @@ -466,7 +466,7 @@ def resize( size_divisor: int = 0, resample: PILImageResampling = PILImageResampling.BILINEAR, data_format=None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be min_size (scalar) or `(height, width)` tuple. If size is an @@ -644,7 +644,7 @@ def preprocess( reduce_labels: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: if "pad_and_return_pixel_mask" in kwargs: warnings.warn( @@ -789,7 +789,7 @@ def encode_inputs( ignore_index: Optional[int] = None, reduce_labels: bool = False, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ): """ Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index 6fed5b05955bf8..ca9712639df2a9 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -1596,7 +1596,6 @@ def forward_post( encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ): - # Masked(Cross)-Attention Block cross_attn_weights = None self_attn_weights = None @@ -1656,7 +1655,6 @@ def forward_pre( encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ): - # Masked(Cross)-Attention Block cross_attn_weights = None self_attn_weights = None @@ -2016,7 +2014,6 @@ def __init__(self, hidden_size: int, num_heads: int, mask_feature_size: torch.Te self.mask_embedder = Mask2FormerMLPPredictionHead(self.hidden_size, self.hidden_size, mask_feature_size) def forward(self, outputs: torch.Tensor, pixel_embeddings: torch.Tensor, attention_mask_target_size: int = None): - mask_embeddings = self.mask_embedder(outputs.transpose(0, 1)) # Sum up over the channels @@ -2063,7 +2060,6 @@ def forward( output_hidden_states: bool = False, output_attentions: bool = False, ) -> Mask2FormerMaskedAttentionDecoderOutput: - multi_stage_features = [] multi_stage_positional_embeddings = [] size_list = [] diff --git a/src/transformers/models/maskformer/configuration_maskformer_swin.py b/src/transformers/models/maskformer/configuration_maskformer_swin.py index 36e0746552c853..e48e3d120129d8 100644 --- a/src/transformers/models/maskformer/configuration_maskformer_swin.py +++ b/src/transformers/models/maskformer/configuration_maskformer_swin.py @@ -113,7 +113,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-5, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py index c08591e044db9e..d56777d4527439 100644 --- a/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py @@ -19,16 +19,16 @@ from pprint import pformat from typing import Any, Dict, Iterator, List, Set, Tuple +import requests import 
torch import torchvision.transforms as T -from PIL import Image -from torch import Tensor, nn - -import requests from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg from detectron2.data import MetadataCatalog from detectron2.projects.deeplab import add_deeplab_config +from PIL import Image +from torch import Tensor, nn + from transformers.models.maskformer.feature_extraction_maskformer import MaskFormerFeatureExtractor from transformers.models.maskformer.modeling_maskformer import ( MaskFormerConfig, @@ -106,7 +106,6 @@ def setup_cfg(args: Args): class OriginalMaskFormerConfigToOursConverter: def __call__(self, original_config: object) -> MaskFormerConfig: - model = original_config.MODEL mask_former = model.MASK_FORMER swin = model.SWIN @@ -557,7 +556,6 @@ def using_dirs(checkpoints_dir: Path, config_dir: Path) -> Iterator[Tuple[object def test(original_model, our_model: MaskFormerForInstanceSegmentation, feature_extractor: MaskFormerFeatureExtractor): with torch.no_grad(): - original_model = original_model.eval() our_model = our_model.eval() @@ -583,7 +581,6 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation, feature_e for original_model_feature, our_model_feature in zip( original_model_backbone_features.values(), our_model_output.encoder_hidden_states ): - assert torch.allclose( original_model_feature, our_model_feature, atol=1e-3 ), "The backbone features are not the same." @@ -635,7 +632,6 @@ def get_name(checkpoint_file: Path): if __name__ == "__main__": - parser = ArgumentParser( description="Command line to convert the original maskformers (with swin backbone) to our implementations." ) @@ -690,7 +686,6 @@ def get_name(checkpoint_file: Path): for config_file, checkpoint_file in OriginalMaskFormerCheckpointToOursConverter.using_dirs( checkpoints_dir, config_dir ): - feature_extractor = OriginalMaskFormerConfigToFeatureExtractorConverter()( setup_cfg(Args(config_file=config_file)) ) diff --git a/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py index f0f69f9aa83681..0657de9ee69e0d 100644 --- a/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py +++ b/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py @@ -21,11 +21,11 @@ import pickle from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import MaskFormerConfig, MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, ResNetConfig from transformers.utils import logging diff --git a/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py index 59606b1a409ad0..4ec5f64f22ae28 100644 --- a/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py +++ b/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py @@ -21,11 +21,11 @@ import pickle from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import MaskFormerConfig, MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, SwinConfig from transformers.utils import logging diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py 
b/src/transformers/models/maskformer/image_processing_maskformer.py index 2a59a0f4db5719..6c3119fd30bff5 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -395,7 +395,7 @@ def __init__( image_std: Union[float, List[float]] = None, ignore_index: Optional[int] = None, do_reduce_labels: bool = False, - **kwargs + **kwargs, ): if "size_divisibility" in kwargs: warnings.warn( @@ -486,7 +486,7 @@ def resize( size_divisor: int = 0, resample: PILImageResampling = PILImageResampling.BILINEAR, data_format=None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be min_size (scalar) or `(height, width)` tuple. If size is an @@ -548,7 +548,7 @@ def convert_segmentation_map_to_binary_masks( instance_id_to_semantic_id: Optional[Dict[int, int]] = None, ignore_index: Optional[int] = None, reduce_labels: bool = False, - **kwargs + **kwargs, ): reduce_labels = reduce_labels if reduce_labels is not None else self.reduce_labels ignore_index = ignore_index if ignore_index is not None else self.ignore_index @@ -665,7 +665,7 @@ def preprocess( do_reduce_labels: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: if "pad_and_return_pixel_mask" in kwargs: warnings.warn( @@ -820,7 +820,7 @@ def encode_inputs( ignore_index: Optional[int] = None, reduce_labels: bool = False, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ): """ Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index 83f741dbf7b6d5..1a775f57fdfb91 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -134,7 +134,7 @@ def __init__( bos_token_id=0, eos_token_id=2, forced_eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/mbart/modeling_flax_mbart.py b/src/transformers/models/mbart/modeling_flax_mbart.py index 42660e260754c6..78375afce4fccc 100644 --- a/src/transformers/models/mbart/modeling_flax_mbart.py +++ b/src/transformers/models/mbart/modeling_flax_mbart.py @@ -19,11 +19,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -953,7 +952,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1507,7 +1506,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/mbart/modeling_mbart.py 
b/src/transformers/models/mbart/modeling_mbart.py index ca440038960791..2548b08ed10bc7 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1068,7 +1068,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..." @@ -1093,7 +1092,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1398,7 +1396,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index b222983331fb27..0f6e18b0b38616 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -803,7 +803,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1128,9 +1127,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFSeq2SeqModelOutput, tf.Tensor]: - if decoder_input_ids is None and decoder_inputs_embeds is None: use_cache = False @@ -1237,9 +1235,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]: - outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -1458,9 +1455,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index b6b4173e50afdd..0c74175e33220e 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -94,9 +94,8 @@ def __init__( tgt_lang=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, additional_special_tokens=None, - **kwargs + **kwargs, ): - # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index 0ac14033a44aa8..b5d24c28dc8ff9 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -107,7 +107,7 @@ def __init__( src_lang=None, tgt_lang=None, additional_special_tokens=None, - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py index 0a331b283760fa..628be52479d0c3 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50.py +++ b/src/transformers/models/mbart50/tokenization_mbart50.py @@ -125,7 +125,7 @@ def __init__( pad_token="", mask_token="", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py index 1ab8ff06e2609f..6bf3b48b378ccf 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py +++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py @@ -122,7 +122,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/mctct/configuration_mctct.py b/src/transformers/models/mctct/configuration_mctct.py index 1c84f2325928e9..6389f2238fc17e 100644 --- a/src/transformers/models/mctct/configuration_mctct.py +++ b/src/transformers/models/mctct/configuration_mctct.py @@ -144,7 +144,7 @@ def __init__( conv_channels=None, ctc_loss_reduction="sum", ctc_zero_infinity=False, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.vocab_size = vocab_size diff --git a/src/transformers/models/mctct/feature_extraction_mctct.py b/src/transformers/models/mctct/feature_extraction_mctct.py index 2f1514054ef02c..d517e3caf85e08 100644 --- a/src/transformers/models/mctct/feature_extraction_mctct.py +++ b/src/transformers/models/mctct/feature_extraction_mctct.py @@ -90,7 +90,7 @@ def __init__( normalize_means=True, normalize_vars=True, return_attention_mask=False, - **kwargs + **kwargs, ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) @@ -247,7 +247,7 @@ def __call__( return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, sampling_rate: Optional[int] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). sequences. 
It returns the diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index 577f7868d5aefe..db9b67090ac727 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -109,7 +109,7 @@ def __init__( pad_token_id=0, position_embedding_type="absolute", use_cache=True, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py index 19124a074b9af1..334b03c9335b6e 100644 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -184,7 +184,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): # For layernorm(s), simply store the layer norm. if op_name.endswith("layernorm"): - ln_name = "attention.ln" if op_name.startswith("input") else "ln" output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val @@ -192,7 +191,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "weight": - # Make sure the QKV pointer is nil. assert attention_qkv_weight is None, "" @@ -204,7 +202,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "bias": - # Make sure we read the weight tensor. assert attention_qkv_weight is not None, "" @@ -232,7 +229,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): # Copy weights and biases as is. elif weight_or_bias in ["weight", "bias"]: - out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + weight_or_bias] = val diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 636130b6f5a044..5b7889adbc94a0 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -543,7 +543,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
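The Megatron checkpoint-conversion hunks that follow mostly delete the blank line that used to follow an `if`/`elif` header, and the modeling hunks do the same for the blank line under a class header. A small illustrative sketch of that cleanup, using hypothetical names (`DummyHead`, `rename`) rather than the real conversion code:

```python
# Illustrative sketch only: the blank line that used to sit directly under an
# `if`/`elif` or `class` header is dropped, as in the surrounding hunks.
# `DummyHead`, `rename`, and the mapping below are hypothetical stand-ins.

megatron_to_transformers = {"mlp.dense_h_to_4h": ".mlp.c_fc."}  # simplified stand-in


class DummyHead:
    # before: a blank line separated the class header from this attribute
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config=None):
        self.config = config


def rename(layer_name, op_name, weight_or_bias):
    if op_name.endswith("layernorm"):
        # before: a blank line followed the `if` header above
        return f"{layer_name}.ln.{weight_or_bias}"
    elif weight_or_bias in ("weight", "bias"):
        # before: a blank line followed the `elif` header above
        return layer_name + megatron_to_transformers.get(op_name, ".") + weight_or_bias
    return None


print(rename("transformer.h.0", "mlp.dense_h_to_4h", "weight"))
```

The removal is formatting-only; no control flow or output changes.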
@@ -1120,7 +1119,6 @@ def forward( MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForCausalLM(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"cls.predictions.decoder"] @@ -1266,7 +1264,6 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings("""MegatronBert Model with a `language modeling` head on top.""", MEGATRON_BERT_START_DOCSTRING) class MegatronBertForMaskedLM(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler", r"seq_relationship"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder"] @@ -1375,7 +1372,6 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"predictions"] def __init__(self, config): @@ -1401,7 +1397,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **kwargs + **kwargs, ) -> Union[Tuple, NextSentencePredictorOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1672,7 +1668,6 @@ def forward( MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForTokenClassification(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1753,7 +1748,6 @@ def forward( MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py index f8fb1e7c8f3bcc..ccb8efb0d5d42f 100644 --- a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py +++ b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py @@ -475,7 +475,6 @@ def convert_checkpoint_from_megatron_to_transformers(args): # For layernorm(s), simply store the layer norm. if op_name.endswith("layernorm"): - ln_name = "ln_1" if op_name.startswith("input") else "ln_2" output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = params @@ -483,7 +482,6 @@ def convert_checkpoint_from_megatron_to_transformers(args): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "weight": - # Insert a tensor of 1x1xDxD bias. causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=dtype)).view( 1, 1, n_positions, n_positions @@ -510,7 +508,6 @@ def convert_checkpoint_from_megatron_to_transformers(args): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "bias": - out_val = megatron_to_transformers_fix_query_key_value_ordering( params, checkpoint_version, 3, heads, hidden_size_per_head ) @@ -519,13 +516,11 @@ def convert_checkpoint_from_megatron_to_transformers(args): # Transpose the weights. elif weight_or_bias == "weight": - out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + "weight"] = params.transpose(0, 1) # Copy the bias. 
elif weight_or_bias == "bias": - out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + "bias"] = params diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py index 778b1384a28b36..db2774e90c87df 100644 --- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py +++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py @@ -179,7 +179,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): # For layernorm(s), simply store the layer norm. if op_name.endswith("layernorm"): - ln_name = "ln_1" if op_name.startswith("input") else "ln_2" output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val @@ -187,7 +186,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "weight": - # Insert a tensor of 1x1xDxD bias. causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view( 1, 1, n_positions, n_positions @@ -208,20 +206,17 @@ def convert_megatron_checkpoint(args, input_state_dict, config): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "bias": - out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) # Store. No change of shape. output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val # Transpose the weights. elif weight_or_bias == "weight": - out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + "weight"] = val.transpose(0, 1) # Copy the bias. elif weight_or_bias == "bias": - out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + "bias"] = val @@ -276,7 +271,6 @@ def main(): # Read the config, or default to the model released by NVIDIA. if args.config_file == "": - if ds_args is not None: if ds_args.bias_gelu_fusion: activation_function = "gelu_fast" diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py index fee7666a51c706..58cc9f11ab7e3e 100644 --- a/src/transformers/models/mluke/tokenization_mluke.py +++ b/src/transformers/models/mluke/tokenization_mluke.py @@ -23,7 +23,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np - import sentencepiece as spm from ...tokenization_utils import PreTrainedTokenizer @@ -388,7 +387,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -534,9 +533,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. 
" @@ -617,7 +615,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -710,7 +708,6 @@ def _check_entity_input_format(self, entities: Optional[EntityInput], entity_spa ) if entities is not None: - if not isinstance(entities, list): raise ValueError("If you specify entities, they should be given as a list") @@ -729,7 +726,7 @@ def _create_input_sequence( entities_pair: Optional[EntityInput] = None, entity_spans: Optional[EntitySpanInput] = None, entity_spans_pair: Optional[EntitySpanInput] = None, - **kwargs + **kwargs, ) -> Tuple[list, list, list, list, list, list]: def get_input_ids(text): tokens = self.tokenize(text, **kwargs) @@ -770,7 +767,6 @@ def get_input_ids_and_entity_token_spans(text, entity_spans): first_entity_token_spans, second_entity_token_spans = None, None if self.task is None: - if entity_spans is None: first_ids = get_input_ids(text) else: @@ -853,7 +849,6 @@ def get_input_ids_and_entity_token_spans(text, entity_spans): first_ids = first_ids[:entity_token_start] + [special_token_id] + first_ids[entity_token_start:] elif self.task == "entity_span_classification": - if not (isinstance(entity_spans, list) and len(entity_spans) > 0 and isinstance(entity_spans[0], tuple)): raise ValueError( "Entity spans should be provided as a list of tuples, " @@ -984,7 +979,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids, diff --git a/src/transformers/models/mobilebert/configuration_mobilebert.py b/src/transformers/models/mobilebert/configuration_mobilebert.py index 7034cdb2769b14..afe6c3b3d92798 100644 --- a/src/transformers/models/mobilebert/configuration_mobilebert.py +++ b/src/transformers/models/mobilebert/configuration_mobilebert.py @@ -136,7 +136,7 @@ def __init__( normalization_type="no_norm", classifier_activation=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index c74096ae4e3453..8e9ff45d6acf70 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -1035,7 +1035,6 @@ def forward( @add_start_docstrings("""MobileBert Model with a `language modeling` head on top.""", MOBILEBERT_START_DOCSTRING) class MobileBertForMaskedLM(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [ "cls.predictions.decoder.weight", @@ -1349,7 +1348,6 @@ def forward( ) # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1553,7 +1551,6 @@ def forward( ) # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing class MobileBertForTokenClassification(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py 
b/src/transformers/models/mobilebert/tokenization_mobilebert.py index 023a5f74dabcef..0ccf9efe02cc5b 100644 --- a/src/transformers/models/mobilebert/tokenization_mobilebert.py +++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py @@ -122,7 +122,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -170,7 +170,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py index 36ea9c61e48c6a..6bac366d237859 100644 --- a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py +++ b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py @@ -101,7 +101,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py index f13ed1b2ea3308..6e367874b760a4 100644 --- a/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py @@ -91,7 +91,7 @@ def __init__( classifier_dropout_prob=0.999, initializer_range=0.02, layer_norm_eps=0.001, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -110,7 +110,6 @@ def __init__( class MobileNetV1OnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py index c00ec9c703e067..66533cb8d034c1 100644 --- a/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py @@ -20,11 +20,11 @@ import re from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( MobileNetV1Config, MobileNetV1FeatureExtractor, diff --git a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py index 9843f600bef27b..c332b96c86a8a2 100644 --- a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py @@ -98,7 +98,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 256} @@ -122,7 +122,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. 
The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -149,7 +149,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to (size["height"], size["width"]). If the input size is smaller than `size` along any @@ -194,7 +194,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py index 5c46d4c10a44e6..73003c9ded9ffc 100644 --- a/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py @@ -115,7 +115,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=0.001, semantic_loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -140,7 +140,6 @@ def __init__( class MobileNetV2OnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py index 70a00d7d23392e..8b216aecb83d8c 100644 --- a/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py @@ -20,11 +20,11 @@ import re from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( MobileNetV2Config, MobileNetV2ForImageClassification, diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py index 343152ebde2d73..7b24547749a16f 100644 --- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py @@ -103,7 +103,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 256} @@ -127,7 +127,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -154,7 +154,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to (size["height"], size["width"]). If the input size is smaller than `size` along any @@ -201,7 +201,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
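The image-processor methods whose signatures change here are documented inline as simple elementwise operations: `rescale` computes `image = image * scale` and `normalize` computes `image = (image - image_mean) / image_std`. A standalone NumPy sketch of those two formulas, independent of the transformers classes and using illustrative values only:

    import numpy as np

    image = np.array([[0.0, 128.0, 255.0]])  # toy 1x3 "image" with values in [0, 255]
    rescaled = image * (1 / 255)             # rescale: image = image * scale        -> approx [0, 0.502, 1]
    normalized = (rescaled - 0.5) / 0.5      # normalize: (image - mean) / std       -> approx [-1, 0.004, 1]
    print(rescaled, normalized)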
diff --git a/src/transformers/models/mobilevit/configuration_mobilevit.py b/src/transformers/models/mobilevit/configuration_mobilevit.py index 83406c96d830c2..fe782c39821a93 100644 --- a/src/transformers/models/mobilevit/configuration_mobilevit.py +++ b/src/transformers/models/mobilevit/configuration_mobilevit.py @@ -136,7 +136,7 @@ def __init__( atrous_rates=[6, 12, 18], aspp_dropout_prob=0.1, semantic_loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -166,7 +166,6 @@ def __init__( class MobileViTOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py b/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py index bc61f8822efa0f..50da675851bcb0 100644 --- a/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py +++ b/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( MobileViTConfig, MobileViTFeatureExtractor, diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py index 7cf24216fe2b7a..b600009c2eada9 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py @@ -117,7 +117,7 @@ def __init__( do_center_crop: bool = True, crop_size: Dict[str, int] = None, do_flip_channel_order: bool = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -140,7 +140,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PIL.Image.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -167,7 +167,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to size `(size["height], size["width"])`. If the input size is smaller than `size` along @@ -191,7 +191,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. 
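The checkpoint-conversion scripts in these hunks (mobilenet_v1, mobilenet_v2, mobilevit) have their imports regrouped rather than changed: `requests`, `torch`, `huggingface_hub`, and `PIL` now sit together in a single third-party block, with the `transformers` imports following in their own block. A sketch of the resulting order for one such script, using only module names that already appear in the hunks above:

    import requests
    import torch
    from huggingface_hub import hf_hub_download
    from PIL import Image

    from transformers import MobileViTConfig, MobileViTFeatureExtractor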
diff --git a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py index e7f44b222b67b0..1b06f36536d6b9 100644 --- a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py @@ -91,7 +91,7 @@ def __init__( dilation: int = 1, use_normalization: bool = True, use_activation: Union[bool, str] = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) logger.warning( @@ -197,7 +197,7 @@ def __init__( out_channels: int, stride: int = 1, num_stages: int = 1, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) @@ -383,7 +383,7 @@ def __init__( hidden_size: int, num_stages: int, dilation: int = 1, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) self.patch_width = config.patch_size @@ -851,7 +851,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPooling]: - output = self.mobilevit(pixel_values, output_hidden_states, return_dict, training=training) return output diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index cd9515aa4c8933..6dc145fdccc542 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -163,7 +163,6 @@ def forward( output_attentions=False, **kwargs, ): - q = self.q(hidden_states) k = self.k(hidden_states) v = self.v(hidden_states) @@ -480,7 +479,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: MPNET_START_DOCSTRING, ) class MPNetModel(MPNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config, add_pooling_layer=True): diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index fd944ce678129d..48866e21d4d8f8 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -510,7 +510,6 @@ def call( return_dict=None, training=False, ): - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -781,7 +780,6 @@ def call(self, hidden_states): @add_start_docstrings("""MPNet Model with a `language modeling` head on top.""", MPNET_START_DOCSTRING) class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss): - _keys_to_ignore_on_load_missing = [r"pooler"] def __init__(self, config, *inputs, **kwargs): @@ -891,7 +889,6 @@ def call(self, features, training=False): MPNET_START_DOCSTRING, ) class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassificationLoss): - _keys_to_ignore_on_load_missing = [r"pooler"] def __init__(self, config, *inputs, **kwargs): @@ -1087,7 +1084,6 @@ def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoic MPNET_START_DOCSTRING, ) class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificationLoss): - _keys_to_ignore_on_load_missing = [r"pooler"] def __init__(self, config, *inputs, **kwargs): @@ -1169,7 +1165,6 @@ def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOu MPNET_START_DOCSTRING, ) class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLoss): - _keys_to_ignore_on_load_missing = [r"pooler"] def __init__(self, config, *inputs, **kwargs): diff --git 
a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py index 28d8b7096ae118..1f5ad2f41aae41 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet.py +++ b/src/transformers/models/mpnet/tokenization_mpnet.py @@ -145,7 +145,7 @@ def __init__( mask_token="", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -205,7 +205,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/mpnet/tokenization_mpnet_fast.py b/src/transformers/models/mpnet/tokenization_mpnet_fast.py index f2fe4fe4fe8f0d..288c69c62b3cd5 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet_fast.py +++ b/src/transformers/models/mpnet/tokenization_mpnet_fast.py @@ -124,7 +124,7 @@ def __init__( mask_token="", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py index d9232c94629db2..168f0eb1cabefc 100644 --- a/src/transformers/models/mt5/configuration_mt5.py +++ b/src/transformers/models/mt5/configuration_mt5.py @@ -91,7 +91,7 @@ def __init__( pad_token_id=0, eos_token_id=1, decoder_start_token_id=0, - **kwargs + **kwargs, ): super().__init__( is_encoder_decoder=is_encoder_decoder, diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index dac08695b6b333..07fab1ce54bb2a 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -117,7 +117,6 @@ def __init__(self, hidden_size, eps=1e-6): self.variance_epsilon = eps def forward(self, hidden_states): - # MT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for @@ -529,7 +528,6 @@ def forward( output_attentions=False, return_dict=True, ): - if past_key_value is not None: if not self.is_decoder: logger.warning("`past_key_values` is passed to the encoder. 
Please make sure this is intended.") @@ -1755,9 +1753,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past is used if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/mvp/configuration_mvp.py b/src/transformers/models/mvp/configuration_mvp.py index 63a006b8e4292f..546da24954c11f 100644 --- a/src/transformers/models/mvp/configuration_mvp.py +++ b/src/transformers/models/mvp/configuration_mvp.py @@ -138,7 +138,7 @@ def __init__( use_prompt=False, prompt_length=100, prompt_mid_dim=800, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index 8823bf17197c0c..dde522535d7a0c 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -1211,7 +1211,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1238,7 +1237,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1353,7 +1351,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqModelOutput]: - # different to other models, Mvp automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -1560,7 +1557,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/mvp/tokenization_mvp.py b/src/transformers/models/mvp/tokenization_mvp.py index 3d5d606d63b5a6..98d373188e0535 100644 --- a/src/transformers/models/mvp/tokenization_mvp.py +++ b/src/transformers/models/mvp/tokenization_mvp.py @@ -180,7 +180,7 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/mvp/tokenization_mvp_fast.py b/src/transformers/models/mvp/tokenization_mvp_fast.py index 00b7a5c6651e6f..28dd1ea942df6b 100644 --- a/src/transformers/models/mvp/tokenization_mvp_fast.py +++ b/src/transformers/models/mvp/tokenization_mvp_fast.py @@ -149,7 +149,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/nat/configuration_nat.py b/src/transformers/models/nat/configuration_nat.py index 32272e62c56274..35f14768a25120 100644 --- a/src/transformers/models/nat/configuration_nat.py +++ b/src/transformers/models/nat/configuration_nat.py @@ -114,7 +114,7 @@ def __init__( layer_norm_eps=1e-5, layer_scale_init_value=0.0, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py 
index c2e445b7ae994b..d455d9e5ee2b5b 100644 --- a/src/transformers/models/nat/modeling_nat.py +++ b/src/transformers/models/nat/modeling_nat.py @@ -329,7 +329,6 @@ def forward( hidden_states: torch.Tensor, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor]: - query_layer = self.transpose_for_scores(self.query(hidden_states)) key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) diff --git a/src/transformers/models/nezha/configuration_nezha.py b/src/transformers/models/nezha/configuration_nezha.py index 7c07a0d9bc2b2d..8d191b7d96ec60 100644 --- a/src/transformers/models/nezha/configuration_nezha.py +++ b/src/transformers/models/nezha/configuration_nezha.py @@ -86,7 +86,7 @@ def __init__( bos_token_id=2, eos_token_id=3, use_cache=True, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/nezha/modeling_nezha.py index b8bf1ac16487fa..98712f0aced535 100644 --- a/src/transformers/models/nezha/modeling_nezha.py +++ b/src/transformers/models/nezha/modeling_nezha.py @@ -570,7 +570,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1138,7 +1137,6 @@ def forward( @add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING) class NezhaForMaskedLM(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"cls.predictions.decoder", r"positions_encoding"] @@ -1540,7 +1538,6 @@ def forward( NEZHA_START_DOCSTRING, ) class NezhaForTokenClassification(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1622,7 +1619,6 @@ def forward( NEZHA_START_DOCSTRING, ) class NezhaForQuestionAnswering(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py index 6a326fd3ca10b7..ac2aa2380b1b59 100644 --- a/src/transformers/models/nllb/tokenization_nllb.py +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -140,9 +140,8 @@ def __init__( tgt_lang=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, additional_special_tokens=None, - **kwargs + **kwargs, ): - # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py index 1afe27f43b4e53..7c6979e295b956 100644 --- a/src/transformers/models/nllb/tokenization_nllb_fast.py +++ b/src/transformers/models/nllb/tokenization_nllb_fast.py @@ -151,7 +151,7 @@ def __init__( src_lang=None, tgt_lang=None, additional_special_tokens=None, - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/nystromformer/configuration_nystromformer.py b/src/transformers/models/nystromformer/configuration_nystromformer.py index 9a1cf726ea73c8..98b3e511ac0e21 100644 --- a/src/transformers/models/nystromformer/configuration_nystromformer.py +++ b/src/transformers/models/nystromformer/configuration_nystromformer.py @@ -112,7 +112,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py index d8e2cfac10190b..8d5a52bdbf82da 100644 --- a/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py @@ -78,7 +78,6 @@ def convert_checkpoint_helper(config, orig_state_dict): def convert_nystromformer_checkpoint(checkpoint_path, nystromformer_config_file, pytorch_dump_path): - orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"] config = NystromformerConfig.from_json_file(nystromformer_config_file) model = NystromformerForMaskedLM(config) diff --git a/src/transformers/models/oneformer/convert_to_hf_oneformer.py b/src/transformers/models/oneformer/convert_to_hf_oneformer.py index 074cc659b2688f..bfe2aee5e22270 100644 --- a/src/transformers/models/oneformer/convert_to_hf_oneformer.py +++ b/src/transformers/models/oneformer/convert_to_hf_oneformer.py @@ -23,13 +23,12 @@ from pprint import pformat from typing import Any, Dict, Iterator, List, Set, Tuple +import requests import torch import torchvision.transforms as T from PIL import Image from torch import Tensor, nn -import requests - try: from detectron2.checkpoint import DetectionCheckpointer diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index af36dbe0ab94df..b1e93c9e393811 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -19,8 +19,8 @@ from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import numpy as np - from huggingface_hub import hf_hub_download + from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from transformers.image_transforms import ( PaddingMode, @@ -403,7 +403,7 @@ def __init__( repo_path: str = "shi-labs/oneformer_demo", class_info_file: str = None, num_text: Optional[int] = None, - **kwargs + **kwargs, ): if "max_size" in kwargs: self._max_size = kwargs.pop("max_size") @@ -443,7 +443,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format=None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be min_size (scalar) or `(height, width)` tuple. 
If size is an @@ -507,7 +507,7 @@ def convert_segmentation_map_to_binary_masks( instance_id_to_semantic_id: Optional[Dict[int, int]] = None, ignore_index: Optional[int] = None, reduce_labels: bool = False, - **kwargs + **kwargs, ): reduce_labels = reduce_labels if reduce_labels is not None else self.reduce_labels ignore_index = ignore_index if ignore_index is not None else self.ignore_index @@ -619,7 +619,7 @@ def preprocess( do_reduce_labels: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: if "pad_and_return_pixel_mask" in kwargs: warnings.warn( @@ -881,7 +881,7 @@ def encode_inputs( ignore_index: Optional[int] = None, reduce_labels: bool = False, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ): """ Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index 2dd04c0a4303f3..84539b83d96db7 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -2507,7 +2507,7 @@ def __init__( visual_dim=1024, dropout=0.1, layer_norm_eps=1e-05, - **kwargs + **kwargs, ): super().__init__() diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py index b08c72daf0492c..df16be211c5d22 100644 --- a/src/transformers/models/openai/configuration_openai.py +++ b/src/transformers/models/openai/configuration_openai.py @@ -136,7 +136,7 @@ def __init__( summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.n_positions = n_positions diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index fbe63a001e3856..4bd4f506e92085 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -248,7 +248,6 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = False, ) -> Union[Tuple, TFBaseModelOutput]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -544,7 +543,6 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = False, ) -> Union[Tuple, TFBaseModelOutput]: - outputs = self.transformer( input_ids=input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/opt/configuration_opt.py b/src/transformers/models/opt/configuration_opt.py index f8b5bc4d8faf9f..df13b32019984e 100644 --- a/src/transformers/models/opt/configuration_opt.py +++ b/src/transformers/models/opt/configuration_opt.py @@ -118,7 +118,7 @@ def __init__( eos_token_id=2, enable_bias=True, layer_norm_elementwise_affine=True, - **kwargs + **kwargs, ): super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/opt/modeling_flax_opt.py b/src/transformers/models/opt/modeling_flax_opt.py index db9154b33b277b..b7038008f5d4d6 100644 --- a/src/transformers/models/opt/modeling_flax_opt.py +++ b/src/transformers/models/opt/modeling_flax_opt.py @@ -309,7 +309,6 @@ def __call__( output_attentions: bool = True, deterministic: bool = True, ) -> Tuple[jnp.ndarray]: - residual = 
hidden_states # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention @@ -527,7 +526,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -666,7 +665,6 @@ def __call__( deterministic: bool = True, init_cache=False, ): - decoder_outputs = self.decoder( input_ids=input_ids, attention_mask=attention_mask, @@ -726,7 +724,6 @@ def __call__( return_dict: bool = True, deterministic: bool = True, ): - outputs = self.model( input_ids, attention_mask, diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index d98b2ff058ee9f..5269c67de79c9e 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -391,7 +391,6 @@ def forward( OPT_START_DOCSTRING, ) class OPTPreTrainedModel(PreTrainedModel): - config_class = OPTConfig base_model_prefix = "model" supports_gradient_checkpointing = True @@ -671,7 +670,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -693,7 +691,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -771,7 +768,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/opt/modeling_tf_opt.py b/src/transformers/models/opt/modeling_tf_opt.py index c9f4c087b387a6..2fcbd444a1d141 100644 --- a/src/transformers/models/opt/modeling_tf_opt.py +++ b/src/transformers/models/opt/modeling_tf_opt.py @@ -742,9 +742,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFBaseModelOutputWithPast, Tuple[tf.Tensor]]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -815,9 +814,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFBaseModelOutputWithPast, Tuple[tf.Tensor]]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -915,7 +913,7 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFCausalLMOutputWithPast, Tuple[tf.Tensor]]: r""" Args: diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index d84cf64937ddfb..978af9dc3d33d0 100644 --- 
a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -109,7 +109,7 @@ def __init__( pad_token_id=0, bos_token_id=49406, eos_token_id=49407, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -127,7 +127,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from OwlViTConfig @@ -212,7 +211,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -231,7 +230,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from OwlViTConfig @@ -281,7 +279,7 @@ def __init__( projection_dim=512, logit_scale_init_value=2.6592, return_dict=True, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -375,7 +373,6 @@ def generate_dummy_inputs( seq_length: int = -1, framework: Optional["TensorType"] = None, ) -> Mapping[str, Any]: - text_input_dict = super().generate_dummy_inputs( processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework ) diff --git a/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py b/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py index 09942fa3928d0b..d2ea6b0a6c516d 100644 --- a/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py +++ b/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py @@ -18,14 +18,14 @@ import argparse import collections -import torch -import torch.nn as nn - import jax import jax.numpy as jnp +import torch +import torch.nn as nn from clip.model import CLIP from flax.training import checkpoints from huggingface_hub import Repository + from transformers import ( CLIPTokenizer, OwlViTConfig, @@ -314,7 +314,6 @@ def convert_clip_backbone(flax_params, torch_config): # Copy flax CLIP backbone params to PyTorch params for name, param in new_torch_params.items(): if name in torch_clip_params.keys(): - new_param = torch.from_numpy(new_torch_params[name]) torch_clip_params[name].copy_(new_param) else: diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py index 650a2d787e636e..ee5d230ea3c341 100644 --- a/src/transformers/models/owlvit/image_processing_owlvit.py +++ b/src/transformers/models/owlvit/image_processing_owlvit.py @@ -136,7 +136,7 @@ def __init__( do_normalize=True, image_mean=None, image_std=None, - **kwargs + **kwargs, ): size = size if size is not None else {"height": 768, "width": 768} size = get_size_dict(size, default_to_square=True) @@ -169,7 +169,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to a certain size. @@ -185,7 +185,7 @@ def center_crop( image: np.ndarray, crop_size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to a certain size. 
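The `center_crop` methods touched here only gain the trailing comma; the operation named in their docstrings is the usual centered slice. A generic NumPy sketch of that arithmetic follows; it is an illustration under simplifying assumptions, not the library's implementation, which also handles channel layout and inputs smaller than the crop:

    import numpy as np

    def center_crop(image: np.ndarray, height: int, width: int) -> np.ndarray:
        """Crop an (H, W, C) array to (height, width, C) around its center."""
        top = (image.shape[0] - height) // 2
        left = (image.shape[1] - width) // 2
        return image[top : top + height, left : left + width]

    print(center_crop(np.zeros((768, 768, 3)), 224, 224).shape)  # (224, 224, 3)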
@@ -201,7 +201,7 @@ def rescale( image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Rescale an image by a certain factor. @@ -214,7 +214,7 @@ def normalize( mean: List[float], std: List[float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image with a certain mean and standard deviation. @@ -236,7 +236,7 @@ def preprocess( image_std: Optional[Union[float, List[float]]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Prepares an image or batch of images for the model. diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 0a7599639bf38d..1a0f3ed0f6d817 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -1276,7 +1276,6 @@ def forward( query_embeds: Optional[torch.FloatTensor], query_mask: Optional[torch.Tensor], ) -> Tuple[torch.FloatTensor]: - image_class_embeds = self.dense0(image_embeds) if query_embeds is None: device = image_class_embeds.device @@ -1408,7 +1407,6 @@ def image_text_embedder( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[torch.FloatTensor]: - # Encode text and image outputs = self.owlvit( pixel_values=pixel_values, @@ -1478,7 +1476,6 @@ def image_embedder( def embed_image_query( self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor ) -> torch.FloatTensor: - _, class_embeds = self.class_predictor(query_image_features) pred_boxes = self.box_predictor(query_image_features, query_feature_map) pred_boxes_as_corners = center_to_corners_format(pred_boxes) diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index f38d61ff8a027f..fd7de9a1a490b9 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -126,7 +126,7 @@ def __init__( pad_token_id=0, eos_token_id=1, forced_eos_token_id=1, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py b/src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py index 9254a0ba941100..739e075233f764 100644 --- a/src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py +++ b/src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py @@ -46,7 +46,6 @@ def rename_state_dict_key(k): - for pegasus_name, hf_name in PATTERNS: k = k.replace(pegasus_name, hf_name) return k diff --git a/src/transformers/models/pegasus/modeling_flax_pegasus.py b/src/transformers/models/pegasus/modeling_flax_pegasus.py index 75d38d59ef2214..b39e2c437e537e 100644 --- a/src/transformers/models/pegasus/modeling_flax_pegasus.py +++ b/src/transformers/models/pegasus/modeling_flax_pegasus.py @@ -20,11 +20,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ 
-903,7 +902,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1458,7 +1457,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 18e17ab5732de7..d4cc8f381e1437 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -1069,7 +1069,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1094,7 +1093,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1458,7 +1456,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index a8c27b6497b909..f38dbde5f37afb 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -821,7 +821,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1142,9 +1141,8 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): - if decoder_input_ids is None and decoder_inputs_embeds is None: use_cache = False @@ -1248,9 +1246,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs + **kwargs, ) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]: - outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -1471,9 +1468,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py index 77127125bb483b..814602fac88d0d 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus.py +++ b/src/transformers/models/pegasus/tokenization_pegasus.py @@ -113,7 +113,7 @@ def __init__( additional_special_tokens=None, offset=103, # entries 2 - 104 are only used for pretraining sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.offset = offset if additional_special_tokens is not None: diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py index 
22c6018385f6d0..d0345fe60cab91 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py +++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py @@ -108,7 +108,7 @@ def __init__( mask_token_sent="", additional_special_tokens=None, offset=103, # entries 2 - 104 are only used for pretraining - **kwargs + **kwargs, ): self.offset = offset diff --git a/src/transformers/models/pegasus_x/configuration_pegasus_x.py b/src/transformers/models/pegasus_x/configuration_pegasus_x.py index 1fc0bfa837454d..c393a6b8a91044 100644 --- a/src/transformers/models/pegasus_x/configuration_pegasus_x.py +++ b/src/transformers/models/pegasus_x/configuration_pegasus_x.py @@ -135,7 +135,7 @@ def __init__( num_global_tokens=32, block_size=512, stagger_local_blocks=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index e33cf1cd3fdee1..f09a0447fd75fb 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -1310,7 +1310,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1333,7 +1332,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1664,7 +1662,7 @@ def prepare_inputs_for_generation( attention_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/perceiver/configuration_perceiver.py b/src/transformers/models/perceiver/configuration_perceiver.py index a4b475532ff87a..9a7c3457882049 100644 --- a/src/transformers/models/perceiver/configuration_perceiver.py +++ b/src/transformers/models/perceiver/configuration_perceiver.py @@ -145,7 +145,7 @@ def __init__( audio_samples_per_frame=1920, samples_per_patch=16, output_shape=[1, 16, 224, 224], - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py b/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py index d1a4fd14e57602..9c925313a343d1 100644 --- a/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py +++ b/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py @@ -20,13 +20,13 @@ import pickle from pathlib import Path +import haiku as hk import numpy as np +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import haiku as hk -import requests -from huggingface_hub import hf_hub_download from transformers import ( PerceiverConfig, PerceiverFeatureExtractor, diff --git a/src/transformers/models/perceiver/image_processing_perceiver.py b/src/transformers/models/perceiver/image_processing_perceiver.py index 00bf238865569e..59b7fd5332bf6f 100644 --- a/src/transformers/models/perceiver/image_processing_perceiver.py +++ b/src/transformers/models/perceiver/image_processing_perceiver.py @@ -95,7 +95,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: 
super().__init__(**kwargs) crop_size = crop_size if crop_size is not None else {"height": 256, "width": 256} @@ -120,7 +120,7 @@ def center_crop( crop_size: Dict[str, int], size: Optional[int] = None, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to `(size["height"] / crop_size["height"] * min_dim, size["width"] / crop_size["width"] * @@ -155,7 +155,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PIL.Image.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. @@ -182,7 +182,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -203,7 +203,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py index 00d5bcb01ba2dc..9a6b6bad5f11db 100755 --- a/src/transformers/models/perceiver/modeling_perceiver.py +++ b/src/transformers/models/perceiver/modeling_perceiver.py @@ -2442,7 +2442,7 @@ def __init__( output_num_channels: int, min_padding_size: Optional[int] = 2, subsampled_index_dims: Optional[Dict[str, PerceiverAbstractDecoder]] = None, - **decoder_kwargs + **decoder_kwargs, ) -> None: super().__init__() self.modalities = nn.ModuleDict(modalities) diff --git a/src/transformers/models/perceiver/tokenization_perceiver.py b/src/transformers/models/perceiver/tokenization_perceiver.py index 958d8a9c1d6115..cbfd9e64150243 100644 --- a/src/transformers/models/perceiver/tokenization_perceiver.py +++ b/src/transformers/models/perceiver/tokenization_perceiver.py @@ -66,9 +66,8 @@ def __init__( cls_token="[CLS]", sep_token="[SEP]", model_max_length=2048, - **kwargs + **kwargs, ) -> None: - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/phobert/tokenization_phobert.py b/src/transformers/models/phobert/tokenization_phobert.py index a37a5645ae424e..dd294ac43a68d5 100644 --- a/src/transformers/models/phobert/tokenization_phobert.py +++ b/src/transformers/models/phobert/tokenization_phobert.py @@ -129,7 +129,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - **kwargs + **kwargs, ): super().__init__( bos_token=bos_token, diff --git a/src/transformers/models/plbart/configuration_plbart.py b/src/transformers/models/plbart/configuration_plbart.py index e3d6c7fbe9fbf1..25f4c31c577859 100644 --- a/src/transformers/models/plbart/configuration_plbart.py +++ b/src/transformers/models/plbart/configuration_plbart.py @@ -132,7 +132,7 @@ def __init__( bos_token_id=0, eos_token_id=2, forced_eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/plbart/modeling_plbart.py 
b/src/transformers/models/plbart/modeling_plbart.py index d0828941ffe75e..ed506a69356f8c 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -1047,7 +1047,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1072,7 +1071,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1371,7 +1369,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask: Optional[torch.Tensor] = None, use_cache: Optional[bool] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, - **kwargs # TODO: Check if this is needed. It is unused? + **kwargs, # TODO: Check if this is needed. It is unused? ) -> Dict[str, Any]: # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py index 94ec77c468c971..bf47538eaabdaf 100644 --- a/src/transformers/models/plbart/tokenization_plbart.py +++ b/src/transformers/models/plbart/tokenization_plbart.py @@ -189,7 +189,7 @@ def __init__( tgt_lang=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, additional_special_tokens=None, - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/poolformer/configuration_poolformer.py b/src/transformers/models/poolformer/configuration_poolformer.py index c55f13b80c9623..550c387adcefe8 100644 --- a/src/transformers/models/poolformer/configuration_poolformer.py +++ b/src/transformers/models/poolformer/configuration_poolformer.py @@ -111,7 +111,7 @@ def __init__( use_layer_scale=True, layer_scale_init_value=1e-5, initializer_range=0.02, - **kwargs + **kwargs, ): self.num_channels = num_channels self.patch_size = patch_size @@ -133,7 +133,6 @@ def __init__( class PoolFormerOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py b/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py index 4ab0d2bfb3d457..d00a9970aaae66 100644 --- a/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py +++ b/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py @@ -19,11 +19,11 @@ from collections import OrderedDict from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import PoolFormerConfig, PoolFormerFeatureExtractor, PoolFormerForImageClassification from transformers.utils import logging diff --git a/src/transformers/models/poolformer/image_processing_poolformer.py b/src/transformers/models/poolformer/image_processing_poolformer.py index d78bf30327b80e..2548c03da469d2 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer.py +++ b/src/transformers/models/poolformer/image_processing_poolformer.py @@ -116,7 +116,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, 
image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -143,7 +143,7 @@ def resize( crop_pct: Optional[float] = None, resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -203,7 +203,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to (size["height"], size["width"]). If the input size is smaller than `crop_size` along @@ -227,7 +227,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -248,7 +248,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/prophetnet/configuration_prophetnet.py b/src/transformers/models/prophetnet/configuration_prophetnet.py index 40f5939d99bc7d..35988eaa132128 100644 --- a/src/transformers/models/prophetnet/configuration_prophetnet.py +++ b/src/transformers/models/prophetnet/configuration_prophetnet.py @@ -132,7 +132,7 @@ def __init__( pad_token_id: Optional[int] = 0, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.hidden_size = hidden_size diff --git a/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py index 638a71ef2fa423..c9e64c06ef769a 100644 --- a/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py @@ -19,8 +19,6 @@ from torch import nn -from transformers import ProphetNetForConditionalGeneration, XLMProphetNetForConditionalGeneration, logging - # transformers_old should correspond to branch `save_old_prophetnet_model_structure` here # original prophetnet_checkpoints are saved under `patrickvonplaten/..._old` respectively from transformers_old.modeling_prophetnet import ( @@ -30,6 +28,8 @@ XLMProphetNetForConditionalGeneration as XLMProphetNetForConditionalGenerationOld, ) +from transformers import ProphetNetForConditionalGeneration, XLMProphetNetForConditionalGeneration, logging + logger = logging.get_logger(__name__) logging.set_verbosity_info() diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index baf7e1dc4ebb4c..7dbef7c6e29245 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -666,7 +666,6 @@ def forward( past_key_value: Optional[Tuple[Tensor]] = None, output_attentions: bool = False, ) -> Tuple[Tensor, Optional[Tensor]]: - batch_size, tgt_len, hidden_size = hidden_states.size() # if key_value_states are provided this layer is used as a cross-attention layer @@ -1589,7 +1588,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is 
not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py index 05e03ad4881a0a..36104d49fb2d8b 100644 --- a/src/transformers/models/prophetnet/tokenization_prophetnet.py +++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py @@ -340,7 +340,7 @@ def __init__( mask_token: Optional[str] = "[MASK]", tokenize_chinese_chars: Optional[bool] = True, strip_accents: Optional[bool] = None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -385,7 +385,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/qdqbert/configuration_qdqbert.py b/src/transformers/models/qdqbert/configuration_qdqbert.py index 090617a6308f95..c4f8c1559e61da 100644 --- a/src/transformers/models/qdqbert/configuration_qdqbert.py +++ b/src/transformers/models/qdqbert/configuration_qdqbert.py @@ -105,7 +105,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py index 40e581d4f6da12..ba24b897234363 100755 --- a/src/transformers/models/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/qdqbert/modeling_qdqbert.py @@ -574,7 +574,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
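(Aside, not part of the patch.) The hunks around here delete nothing but a blank line that used to sit directly under a block opener, whether an `if self.gradient_checkpointing and self.training:` check or a `class ...:` header before its first attribute. This looks like the Black 23.x stable style, which removes empty lines immediately after a block is opened; a minimal runnable sketch with hypothetical names, showing the layout the formatter produces:

    import logging

    logger = logging.getLogger(__name__)

    def maybe_disable_cache(gradient_checkpointing: bool, training: bool, use_cache: bool) -> bool:
        # The old layout had an empty line right after this `if`; the reformatted
        # code keeps the body flush against the opener, as in the hunks above.
        if gradient_checkpointing and training:
            if use_cache:
                logger.warning("`use_cache=True` is incompatible with gradient checkpointing.")
                use_cache = False
        return use_cache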
@@ -1013,7 +1012,6 @@ def forward( """QDQBERT Model with a `language modeling` head on top for CLM fine-tuning.""", QDQBERT_START_DOCSTRING ) class QDQBertLMHeadModel(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] @@ -1145,7 +1143,7 @@ def prepare_inputs_for_generation( input_ids: Optional[torch.LongTensor], past_key_values=None, attention_mask: Optional[torch.Tensor] = None, - **model_kwargs + **model_kwargs, ): input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly @@ -1167,7 +1165,6 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings("""QDQBERT Model with a `language modeling` head on top.""", QDQBERT_START_DOCSTRING) class QDQBertForMaskedLM(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] @@ -1571,7 +1568,6 @@ def forward( QDQBERT_START_DOCSTRING, ) class QDQBertForTokenClassification(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1652,7 +1648,6 @@ def forward( QDQBERT_START_DOCSTRING, ) class QDQBertForQuestionAnswering(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/rag/configuration_rag.py b/src/transformers/models/rag/configuration_rag.py index 109588eadbdfaa..59d2951deffe1c 100644 --- a/src/transformers/models/rag/configuration_rag.py +++ b/src/transformers/models/rag/configuration_rag.py @@ -112,7 +112,7 @@ def __init__( output_retrieved=False, use_cache=True, forced_eos_token_id=None, - **kwargs + **kwargs, ): super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 52aca10d5c03ed..df6b56d05c6eee 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -246,7 +246,7 @@ def from_pretrained_question_encoder_generator( question_encoder_pretrained_model_name_or_path: str = None, generator_pretrained_model_name_or_path: str = None, retriever: RagRetriever = None, - **kwargs + **kwargs, ) -> PreTrainedModel: r""" Instantiates an question encoder and a generator from one or two base classes of the library from pretrained @@ -588,7 +588,6 @@ def forward( ) # encoder_outputs are pre-computed during RAG-token generation if encoder_outputs is None: - if has_to_retrieve: question_enc_outputs = self.question_encoder( input_ids, attention_mask=attention_mask, return_dict=True @@ -603,7 +602,6 @@ def forward( return_tensors="pt", ) if self.context_encoder_training: - ( context_input_ids, context_attention_mask, @@ -789,7 +787,7 @@ def forward( reduce_loss: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, n_docs: Optional[int] = None, - **kwargs # needs kwargs for generation + **kwargs, # needs kwargs for generation ) -> RetrievAugLMMarginOutput: r""" exclude_bos_score (`bool`, *optional*): @@ -921,7 +919,7 @@ def generate( num_return_sequences: Optional[int] = None, # defaults to 1 num_beams: Optional[int] = None, # defaults to 1 n_docs: Optional[int] = None, - **model_kwargs + **model_kwargs, ) -> torch.LongTensor: """ Implements RAG sequence "thorough" decoding. 
Read the [`~generation.GenerationMixin.generate`]` documentation @@ -1176,7 +1174,7 @@ def prepare_inputs_for_generation( encoder_outputs=None, doc_scores=None, n_docs=None, - **kwargs + **kwargs, ): if past_key_values is not None: # if past is defined use only last decoder_input_ids @@ -1225,7 +1223,6 @@ def _reorder_stacked(hidden_states, new_order): return reordered_past def marginalize(self, seq_logits, doc_scores, n_docs=None): - n_docs = n_docs if n_docs is not None else self.config.n_docs # RAG-token marginalization @@ -1257,7 +1254,7 @@ def forward( reduce_loss: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, n_docs: Optional[int] = None, - **kwargs # needs kwargs for generation + **kwargs, # needs kwargs for generation ) -> RetrievAugLMMarginOutput: r""" do_marginalize (`bool`, *optional*): @@ -1390,7 +1387,7 @@ def generate( prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]] = None, logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(), stopping_criteria: Optional[StoppingCriteriaList] = StoppingCriteriaList(), - **kwargs + **kwargs, ) -> torch.LongTensor: """ Implements RAG token decoding. diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index cda15a8b4454ae..0ea2e554489b6f 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -232,7 +232,7 @@ def from_pretrained_question_encoder_generator( generator_pretrained_model_name_or_path: str = None, retriever: RagRetriever = None, *model_args, - **kwargs + **kwargs, ) -> TFPreTrainedModel: r""" Instantiates an question encoder and a generator from one or two base classes of the library from pretrained @@ -491,7 +491,6 @@ def from_pretrained_question_encoder_generator( @add_start_docstrings_to_model_forward(RAG_START_DOCSTRING) class TFRagModel(TFRagPreTrainedModel): - load_weight_prefix = "tf_rag_model_1" def __init__( @@ -562,7 +561,7 @@ def call( n_docs: Optional[int] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs + **kwargs, ): r""" Returns: @@ -602,7 +601,6 @@ def call( # encoder_outputs are pre-computed during RAG-token generation if encoder_outputs is None: - if has_to_retrieve: question_enc_outputs = self.question_encoder( input_ids, attention_mask=attention_mask, return_dict=True, training=training @@ -726,7 +724,6 @@ def call( RAG_START_DOCSTRING, ) class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss): - load_weight_prefix = "tf_rag_token_for_generation_1/rag" def __init__( @@ -771,7 +768,7 @@ def prepare_inputs_for_generation( encoder_outputs=None, doc_scores=None, n_docs=None, - **kwargs + **kwargs, ): if past_key_values is not None: # if past is defined use only last decoder_input_ids @@ -863,7 +860,7 @@ def call( reduce_loss: Optional[bool] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs # needs kwargs for generation + **kwargs, # needs kwargs for generation ): r""" do_marginalize (`bool`, *optional*): @@ -1004,7 +1001,7 @@ def generate( n_docs=None, generation_config=None, logits_processor=TFLogitsProcessorList(), - **kwargs + **kwargs, ): """ Implements TFRAG token decoding. 
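(Aside, not part of the patch.) Nearly every signature hunk in this file changes only the last character of a multi-line parameter list: `**kwargs` becomes `**kwargs,`. That is consistent with a newer Black release keeping the magic trailing comma after the final parameter, including `**kwargs`, once a signature is already split across lines, and is likely a consequence of the Black version bump that accompanies this tooling change. A hedged sketch with made-up parameter names:

    def generate(
        input_ids=None,
        attention_mask=None,
        num_beams=None,
        **model_kwargs,  # the formatter now keeps a comma after the final parameter too
    ):
        # Dummy body; only the signature layout matters for this example.
        return input_ids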
@@ -1299,7 +1296,6 @@ def hf_compute_loss(self, labels, y_pred, smooth_epsilon=0.0, from_logits=True, RAG_START_DOCSTRING, ) class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss): - load_weight_prefix = "tf_rag_sequence_for_generation_1/rag" def __init__( @@ -1370,7 +1366,7 @@ def call( reduce_loss: Optional[bool] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs # needs kwargs for generation + **kwargs, # needs kwargs for generation ) -> Union[Tuple[tf.Tensor], TFRetrievAugLMMarginOutput]: r""" exclude_bos_score (`bool`, *optional*): @@ -1592,7 +1588,7 @@ def generate( num_return_sequences=None, # defaults to 1 num_beams=None, # defaults to 1 n_docs=None, - **model_kwargs + **model_kwargs, ): """ Implements RAG sequence "thorough" decoding. Read the [`~generation.GenerationMixin.generate`]` documentation diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 5f15058e5f83bd..bef2baf05f202d 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -153,7 +153,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 1231cea1b9b66b..a7423714944c4c 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -577,7 +577,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -1526,7 +1525,6 @@ def forward( @add_start_docstrings("The reader of REALM.", REALM_START_DOCSTRING) class RealmReader(RealmPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"] def __init__(self, config): diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py index db6c8c7246be31..4bdf19454f0815 100644 --- a/src/transformers/models/realm/retrieval_realm.py +++ b/src/transformers/models/realm/retrieval_realm.py @@ -18,8 +18,8 @@ from typing import Optional, Union import numpy as np - from huggingface_hub import hf_hub_download + from transformers import AutoTokenizer from ...utils import logging diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index 28ee33ab87bc07..de067b0594cad8 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -155,7 +155,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -203,7 +203,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/realm/tokenization_realm_fast.py index f61fa8418ed2ba..4db8b165b96300 100644 --- a/src/transformers/models/realm/tokenization_realm_fast.py +++ b/src/transformers/models/realm/tokenization_realm_fast.py @@ -160,7 +160,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/reformer/configuration_reformer.py b/src/transformers/models/reformer/configuration_reformer.py index d4ffb2a39b055d..af712ced1eed0e 100755 --- a/src/transformers/models/reformer/configuration_reformer.py +++ b/src/transformers/models/reformer/configuration_reformer.py @@ -197,7 +197,7 @@ def __init__( tie_word_embeddings=False, use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): self.hash_seed = hash_seed self.vocab_size = vocab_size diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index af76a55b1feb01..a9ca155bb1b14e 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -1226,7 +1226,6 @@ def forward( def _compute_attn_mask( self, query_indices, key_indices, attention_mask, query_key_dots_shape, do_standard_self_attention ): - # chunk attention mask and look before and after if attention_mask is not None: attention_mask = attention_mask.to(torch.uint8)[:, None, :] diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py index 814d5ed6cde11a..8796c8149c8ae6 100644 --- a/src/transformers/models/reformer/tokenization_reformer.py +++ b/src/transformers/models/reformer/tokenization_reformer.py @@ -102,7 +102,7 @@ def __init__( unk_token="", additional_special_tokens=[], sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git 
a/src/transformers/models/reformer/tokenization_reformer_fast.py b/src/transformers/models/reformer/tokenization_reformer_fast.py index e9c6a61993d09a..4fae5943d72108 100644 --- a/src/transformers/models/reformer/tokenization_reformer_fast.py +++ b/src/transformers/models/reformer/tokenization_reformer_fast.py @@ -98,7 +98,7 @@ def __init__( eos_token="", unk_token="", additional_special_tokens=[], - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/regnet/configuration_regnet.py b/src/transformers/models/regnet/configuration_regnet.py index 13b61c235be06c..201354d1553c34 100644 --- a/src/transformers/models/regnet/configuration_regnet.py +++ b/src/transformers/models/regnet/configuration_regnet.py @@ -78,7 +78,7 @@ def __init__( groups_width=64, layer_type="y", hidden_act="relu", - **kwargs + **kwargs, ): super().__init__(**kwargs) if layer_type not in self.layer_types: diff --git a/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py index 4a73b9623f113c..22a8a99ca20b03 100644 --- a/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py +++ b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py @@ -29,14 +29,14 @@ import torch import torch.nn as nn -from torch import Tensor - from classy_vision.models.regnet import RegNet, RegNetParams from huggingface_hub import cached_download, hf_hub_url +from torch import Tensor +from vissl.models.model_helpers import get_trunk_forward_outputs + from transformers import AutoFeatureExtractor, RegNetConfig, RegNetForImageClassification, RegNetModel from transformers.modeling_utils import PreTrainedModel from transformers.utils import logging -from vissl.models.model_helpers import get_trunk_forward_outputs logging.set_verbosity_info() diff --git a/src/transformers/models/regnet/convert_regnet_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_to_pytorch.py index acb74dc89dce93..6b34c6aa19c104 100644 --- a/src/transformers/models/regnet/convert_regnet_to_pytorch.py +++ b/src/transformers/models/regnet/convert_regnet_to_pytorch.py @@ -22,16 +22,16 @@ from pathlib import Path from typing import Callable, Dict, List, Tuple +import timm import torch import torch.nn as nn -from torch import Tensor - -import timm from classy_vision.models.regnet import RegNet, RegNetParams, RegNetY32gf, RegNetY64gf, RegNetY128gf from huggingface_hub import cached_download, hf_hub_url +from torch import Tensor +from vissl.models.model_helpers import get_trunk_forward_outputs + from transformers import AutoFeatureExtractor, RegNetConfig, RegNetForImageClassification, RegNetModel from transformers.utils import logging -from vissl.models.model_helpers import get_trunk_forward_outputs logging.set_verbosity_info() diff --git a/src/transformers/models/rembert/configuration_rembert.py b/src/transformers/models/rembert/configuration_rembert.py index 550722569178e9..792a6dbcfadfe7 100644 --- a/src/transformers/models/rembert/configuration_rembert.py +++ b/src/transformers/models/rembert/configuration_rembert.py @@ -119,7 +119,7 @@ def __init__( pad_token_id=0, bos_token_id=312, eos_token_id=313, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index c94c3a491eefec..9a65cb97ab56bf 100755 --- 
a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -521,7 +521,6 @@ def forward( output_hidden_states: bool = False, return_dict: bool = True, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - hidden_states = self.embedding_hidden_mapping_in(hidden_states) all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -536,7 +535,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1015,7 +1013,6 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ """RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING ) class RemBertForCausalLM(RemBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index 46bcfa3458a792..74683dfc0c0884 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -669,7 +669,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/rembert/tokenization_rembert.py b/src/transformers/models/rembert/tokenization_rembert.py index 4c2cce94aa3ad6..cff101451b4a97 100644 --- a/src/transformers/models/rembert/tokenization_rembert.py +++ b/src/transformers/models/rembert/tokenization_rembert.py @@ -109,7 +109,7 @@ def __init__( pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, diff --git a/src/transformers/models/rembert/tokenization_rembert_fast.py b/src/transformers/models/rembert/tokenization_rembert_fast.py index 72c402438f50f2..5d5032f4112838 100644 --- a/src/transformers/models/rembert/tokenization_rembert_fast.py +++ b/src/transformers/models/rembert/tokenization_rembert_fast.py @@ -114,7 +114,7 @@ def __init__( pad_token="", cls_token="[CLS]", mask_token="[MASK]", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/resnet/configuration_resnet.py b/src/transformers/models/resnet/configuration_resnet.py index 74f6c693972298..bf1855340b074f 100644 --- a/src/transformers/models/resnet/configuration_resnet.py +++ b/src/transformers/models/resnet/configuration_resnet.py @@ -89,7 +89,7 @@ def __init__( hidden_act="relu", downsample_in_first_stage=False, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) if layer_type not in self.layer_types: @@ -114,7 +114,6 @@ def __init__( class ResNetOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/resnet/convert_resnet_to_pytorch.py b/src/transformers/models/resnet/convert_resnet_to_pytorch.py index ef3d564185df8c..5f836c9d2a05c7 100644 --- a/src/transformers/models/resnet/convert_resnet_to_pytorch.py +++ b/src/transformers/models/resnet/convert_resnet_to_pytorch.py @@ -22,12 +22,12 @@ from pathlib import Path from typing import List +import timm import torch import torch.nn as nn +from huggingface_hub import hf_hub_download from torch import Tensor -import timm -from huggingface_hub import hf_hub_download from transformers import AutoFeatureExtractor, ResNetConfig, ResNetForImageClassification from transformers.utils import logging diff --git a/src/transformers/models/resnet/modeling_tf_resnet.py b/src/transformers/models/resnet/modeling_tf_resnet.py index 9f02b46f8baffc..bb6035adf2df64 100644 --- a/src/transformers/models/resnet/modeling_tf_resnet.py +++ b/src/transformers/models/resnet/modeling_tf_resnet.py @@ -171,7 +171,7 @@ def __init__( stride: int = 1, activation: str = "relu", reduction: int = 4, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) should_apply_shortcut = in_channels != out_channels or stride != 1 diff --git a/src/transformers/models/retribert/configuration_retribert.py b/src/transformers/models/retribert/configuration_retribert.py index 23172cf40ec7d3..33663ad6167f9f 100644 --- a/src/transformers/models/retribert/configuration_retribert.py +++ b/src/transformers/models/retribert/configuration_retribert.py @@ -91,7 +91,7 @@ def __init__( share_encoders=True, projection_dim=128, pad_token_id=0, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/retribert/tokenization_retribert.py b/src/transformers/models/retribert/tokenization_retribert.py index a15cbcdf74bfda..0c04c363ebe0a0 100644 --- a/src/transformers/models/retribert/tokenization_retribert.py +++ b/src/transformers/models/retribert/tokenization_retribert.py @@ -68,7 +68,6 @@ def whitespace_tokenize(text): class RetriBertTokenizer(PreTrainedTokenizer): - r""" Constructs a RetriBERT tokenizer. 
@@ -131,7 +130,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -183,7 +182,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/retribert/tokenization_retribert_fast.py b/src/transformers/models/retribert/tokenization_retribert_fast.py index 2532f839a30090..c242213e1faedb 100644 --- a/src/transformers/models/retribert/tokenization_retribert_fast.py +++ b/src/transformers/models/retribert/tokenization_retribert_fast.py @@ -114,7 +114,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/roberta/configuration_roberta.py b/src/transformers/models/roberta/configuration_roberta.py index 3d911f5847dc76..3025fe2833d904 100644 --- a/src/transformers/models/roberta/configuration_roberta.py +++ b/src/transformers/models/roberta/configuration_roberta.py @@ -123,7 +123,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/roberta/modeling_flax_roberta.py b/src/transformers/models/roberta/modeling_flax_roberta.py index bfb6ea365adcf6..5de7375d383843 100644 --- a/src/transformers/models/roberta/modeling_flax_roberta.py +++ b/src/transformers/models/roberta/modeling_flax_roberta.py @@ -14,11 +14,10 @@ # limitations under the License. from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -739,7 +738,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 884773e641de1a..2b6d47b42036c4 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -501,7 +501,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
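(Aside, not part of the patch.) The import hunks in the conversion scripts and Flax modules above all follow one pattern: third-party imports (`numpy`, `torch`, `jax`, `flax`, `requests`, `huggingface_hub`, ...) are merged into a single alphabetically sorted block, and `transformers` imports move into their own block after it. That matches ruff's isort-compatible rules with `transformers` treated as a first-party package, which is an assumption about the repository's ruff settings. A small illustrative module:

    # Third-party imports: one block, sorted case-insensitively.
    import jax
    import jax.numpy as jnp
    import numpy as np
    from flax.core.frozen_dict import FrozenDict

    # First-party imports: a separate block placed after the third-party one.
    from transformers import PretrainedConfig

    # Trivial usages so the example stands on its own.
    params = FrozenDict({"scale": np.ones(3)})
    scaled = jnp.asarray(params["scale"]) * jax.device_count()
    config = PretrainedConfig()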
diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index c38de45e5629e9..606afb754b4ffe 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -633,7 +633,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py index 10b28125e92bce..d291a2f9d97a08 100644 --- a/src/transformers/models/roberta/tokenization_roberta.py +++ b/src/transformers/models/roberta/tokenization_roberta.py @@ -198,7 +198,7 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py index 29381404c47fe9..49311b3aeff9ab 100644 --- a/src/transformers/models/roberta/tokenization_roberta_fast.py +++ b/src/transformers/models/roberta/tokenization_roberta_fast.py @@ -173,7 +173,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py index 1683e527aa8776..49f92586c1b732 100644 --- a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py @@ -124,7 +124,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py index 463f8d58a61eaf..41fd14c5fddff2 100644 --- a/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py @@ -18,8 +18,8 @@ import argparse import torch - from huggingface_hub import hf_hub_download + from transformers import AutoTokenizer, RobertaPreLayerNormConfig, RobertaPreLayerNormForMaskedLM from transformers.utils import logging diff --git a/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py index dc9ef03afdc7e8..8f5dc7944c1614 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py @@ -15,11 +15,10 @@ """ Flax RoBERTa-PreLayerNorm model.""" from typing import Callable, Optional, Tuple -import numpy as np - import 
flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -742,7 +741,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index 037f40fb6d7d78..e5fbb6e3413c1b 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -503,7 +503,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py index d5808a82542b19..1843605bd04a26 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py @@ -634,7 +634,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py index 63f3dbc49eacf0..2f0a0dd0e0f7ac 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -136,7 +136,7 @@ def __init__( shape_embed_dim=512, shape_vocab_size=24858, concat_input=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 11b72170c44981..ac3d374a556c7a 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -635,7 +635,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -1551,7 +1550,7 @@ def prepare_inputs_for_generation( input_pronunciation_ids=None, past_key_values=None, attention_mask=None, - **model_kwargs + **model_kwargs, ): input_shape = input_ids.shape diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index 07e740577a06c3..4338c098ba79f6 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -154,7 +154,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -214,7 +214,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) @@ -243,7 +242,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: def get_input_ids(text): if isinstance(text, str): @@ -342,7 +341,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It @@ -584,7 +583,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: def get_input_ids(text): if isinstance(text, str): diff --git a/src/transformers/models/roformer/configuration_roformer.py b/src/transformers/models/roformer/configuration_roformer.py index cbd92b412ba385..8a9e2dba958398 100644 --- a/src/transformers/models/roformer/configuration_roformer.py +++ b/src/transformers/models/roformer/configuration_roformer.py @@ -126,7 +126,7 @@ def __init__( pad_token_id=0, rotary_value=False, use_cache=True, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/roformer/modeling_flax_roformer.py b/src/transformers/models/roformer/modeling_flax_roformer.py index d18640d29b5760..d95a4d73832e9a 100644 --- a/src/transformers/models/roformer/modeling_flax_roformer.py +++ b/src/transformers/models/roformer/modeling_flax_roformer.py @@ -16,11 +16,10 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen.attention import dot_product_attention_weights from flax.traverse_util import flatten_dict, unflatten_dict @@ -622,7 +621,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index 33a85deac9ba76..3631b9704f6ec6 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -572,7 +572,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if 
self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index 018595d64baa27..952250e68a04fb 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -623,7 +623,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py index 5ab1f694ad9a0f..6c0b6cd4f3c281 100644 --- a/src/transformers/models/roformer/tokenization_roformer.py +++ b/src/transformers/models/roformer/tokenization_roformer.py @@ -364,7 +364,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py index 7b2cab56886200..88ccf183d17462 100644 --- a/src/transformers/models/roformer/tokenization_roformer_fast.py +++ b/src/transformers/models/roformer/tokenization_roformer_fast.py @@ -106,7 +106,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/segformer/configuration_segformer.py b/src/transformers/models/segformer/configuration_segformer.py index 994ecba2097d15..44a835c4b06cab 100644 --- a/src/transformers/models/segformer/configuration_segformer.py +++ b/src/transformers/models/segformer/configuration_segformer.py @@ -122,7 +122,7 @@ def __init__( layer_norm_eps=1e-6, decoder_hidden_size=256, semantic_loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -155,7 +155,6 @@ def __init__( class SegformerOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py b/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py index 00dddc9974a953..024a8699b01e8b 100644 --- a/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py +++ b/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py @@ -20,11 +20,11 @@ from collections import OrderedDict from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( SegformerConfig, SegformerFeatureExtractor, diff --git a/src/transformers/models/segformer/image_processing_segformer.py b/src/transformers/models/segformer/image_processing_segformer.py index f52e7d0d00b4db..1920c6668b0c19 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ b/src/transformers/models/segformer/image_processing_segformer.py @@ -96,7 +96,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_reduce_labels: bool = False, - 
**kwargs + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -146,7 +146,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. @@ -173,7 +173,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -194,7 +194,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 4ac9a87406497e..c877e86acfab4c 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -107,7 +107,7 @@ def __init__( hidden_size: int, num_attention_heads: int, sequence_reduction_ratio: int, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.hidden_size = hidden_size @@ -213,7 +213,7 @@ def __init__( hidden_size: int, num_attention_heads: int, sequence_reduction_ratio: int, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.self = TFSegformerEfficientSelfAttention( @@ -262,7 +262,7 @@ def __init__( in_features: int, hidden_features: int = None, out_features: int = None, - **kwargs + **kwargs, ): super().__init__(**kwargs) out_features = out_features or in_features @@ -296,7 +296,7 @@ def __init__( drop_path: float, sequence_reduction_ratio: int, mlp_ratio: int, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1") diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py index 1c69818fa156b5..af7041843de3a9 100644 --- a/src/transformers/models/sew/configuration_sew.py +++ b/src/transformers/models/sew/configuration_sew.py @@ -190,7 +190,7 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py index 58c0338a850d0f..81c3284af8ef6e 100644 --- a/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py @@ -25,6 +25,7 @@ # Register SEW's fairseq modules from sew_asapp import tasks # noqa: F401 + from transformers import ( SEWConfig, SEWForCTC, diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 3f308891ec2b49..12ab9232ac8d82 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -1053,7 +1053,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git 
a/src/transformers/models/sew_d/configuration_sew_d.py b/src/transformers/models/sew_d/configuration_sew_d.py index 845d3698e20e96..adf2ff04b8d6d8 100644 --- a/src/transformers/models/sew_d/configuration_sew_d.py +++ b/src/transformers/models/sew_d/configuration_sew_d.py @@ -214,7 +214,7 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py index 942add470b9c68..7844d7912f2c8b 100644 --- a/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py @@ -25,6 +25,7 @@ # Register SEW's fairseq modules from sew_asapp import tasks # noqa: F401 + from transformers import ( SEWDConfig, SEWDForCTC, diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index b25d7be0c7ebed..9ddfc7821f4088 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1132,7 +1132,6 @@ def forward( rel_embeddings = self.get_rel_embedding() output_states = next_kv for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (output_states,) @@ -1594,7 +1593,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py index 8ecf0967635708..d7e7bdf57fc913 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py @@ -345,9 +345,8 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): - if not _do_init: raise ValueError( "`FlaxSpeechEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`." 
@@ -616,7 +615,6 @@ def decode( def _decoder_forward( module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs ): - projection_module = module._get_projection_module() decoder_module = module._get_decoder_module() @@ -750,7 +748,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape @@ -787,7 +785,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, *model_args, - **kwargs + **kwargs, ) -> FlaxPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index dab31b947051be..0bc4134d4b554a 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -290,7 +290,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, - **kwargs + **kwargs, ) -> PreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/speech_to_text/configuration_speech_to_text.py b/src/transformers/models/speech_to_text/configuration_speech_to_text.py index 7c22e20e4d49b0..8bad1972e09215 100644 --- a/src/transformers/models/speech_to_text/configuration_speech_to_text.py +++ b/src/transformers/models/speech_to_text/configuration_speech_to_text.py @@ -144,7 +144,7 @@ def __init__( conv_channels=1024, input_feat_per_channel=80, input_channels=1, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.d_model = d_model diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py index 1492ce942f86bb..a5e6b0d4004264 100644 --- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py +++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py @@ -68,7 +68,7 @@ def __init__( do_ceptral_normalize=True, normalize_means=True, normalize_vars=True, - **kwargs + **kwargs, ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) self.num_mel_bins = num_mel_bins @@ -133,7 +133,7 @@ def __call__( return_tensors: Optional[Union[str, TensorType]] = None, sampling_rate: Optional[int] = None, return_attention_mask: Optional[bool] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). 
diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index e1c8d467e3160f..e68009a3ae15a6 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1048,7 +1048,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache =" @@ -1074,7 +1073,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1402,7 +1400,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 1994417d7b71a5..85333f4e0dd896 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -1168,7 +1168,7 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1279,7 +1279,7 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs + **kwargs, ) -> Union[Tuple, TFSeq2SeqModelOutput]: outputs = self.model( input_features=input_features, @@ -1370,7 +1370,7 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[Tuple, TFSeq2SeqLMOutput]: r""" labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1484,7 +1484,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py index fddbc8c7afd47f..596f6bea0bbce9 100644 --- a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py @@ -110,7 +110,7 @@ def __init__( bos_token_id=0, eos_token_id=2, max_target_positions=1024, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.d_model = d_model diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index f0452e4df1a619..8d5c508b9808c3 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -657,7 +657,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache = True` is incompatible with gradient checkpointing. 
Setting `use_cache =" @@ -683,7 +682,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py index 3365dfe382ae6f..4c90ba05ba31c9 100644 --- a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py @@ -108,7 +108,7 @@ def __init__( unk_token="<unk>", do_lower_case=False, merges_file=None, - **kwargs + **kwargs, ): super().__init__( unk_token=unk_token, diff --git a/src/transformers/models/speecht5/configuration_speecht5.py b/src/transformers/models/speecht5/configuration_speecht5.py index c36d6e22faf938..9385e44dc664ea 100644 --- a/src/transformers/models/speecht5/configuration_speecht5.py +++ b/src/transformers/models/speecht5/configuration_speecht5.py @@ -246,7 +246,7 @@ def __init__( decoder_max_relative_position=160, use_cache=True, is_encoder_decoder=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.hidden_size = hidden_size diff --git a/src/transformers/models/speecht5/feature_extraction_speecht5.py b/src/transformers/models/speecht5/feature_extraction_speecht5.py index 4f08b1cf577940..7b4f2af17367ef 100644 --- a/src/transformers/models/speecht5/feature_extraction_speecht5.py +++ b/src/transformers/models/speecht5/feature_extraction_speecht5.py @@ -91,7 +91,7 @@ def __init__( mel_floor: float = 1e-10, reduction_factor: int = 2, return_attention_mask: bool = True, - **kwargs + **kwargs, ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) self.do_normalize = do_normalize @@ -242,7 +242,7 @@ def __call__( return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, sampling_rate: Optional[int] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). 
@@ -358,7 +358,7 @@ def _process_audio( pad_to_multiple_of: Optional[int] = None, return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchFeature: is_batched = bool( isinstance(speech, (list, tuple)) diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index f4d51d665f4e9a..d9c381fdc86d14 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -1440,7 +1440,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: - hidden_states, attention_mask = self.prenet(input_values, attention_mask) outputs = self.wrapped_encoder( @@ -1484,7 +1483,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: - hidden_states = self.prenet(input_values) outputs = self.wrapped_encoder( @@ -1522,7 +1520,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: - return self.wrapped_encoder( hidden_states=input_values, attention_mask=attention_mask, @@ -1792,7 +1789,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - decoder_hidden_states = self.prenet(input_values, speaker_embeddings) outputs = self.wrapped_decoder( @@ -1846,7 +1842,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values) outputs = self.wrapped_decoder( @@ -1894,7 +1889,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - outputs = self.wrapped_decoder( hidden_states=input_values, attention_mask=attention_mask, @@ -2381,7 +2375,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past is not None: diff --git a/src/transformers/models/speecht5/tokenization_speecht5.py b/src/transformers/models/speecht5/tokenization_speecht5.py index 4b641558b7c0f1..a0b933f305610f 100644 --- a/src/transformers/models/speecht5/tokenization_speecht5.py +++ b/src/transformers/models/speecht5/tokenization_speecht5.py @@ -98,7 +98,7 @@ def __init__( unk_token="<unk>", pad_token="<pad>", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/splinter/configuration_splinter.py b/src/transformers/models/splinter/configuration_splinter.py index 60b2580bec1aff..bdbe5f013143a6 100644 --- a/src/transformers/models/splinter/configuration_splinter.py +++ b/src/transformers/models/splinter/configuration_splinter.py @@ -107,7 +107,7 @@ def __init__( use_cache=True, pad_token_id=0, question_token_id=104, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 954e0aa356d524..c44a9886640899 100755 --- 
a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -450,7 +450,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/splinter/tokenization_splinter.py b/src/transformers/models/splinter/tokenization_splinter.py index 40daeb09465ad1..308680940db106 100644 --- a/src/transformers/models/splinter/tokenization_splinter.py +++ b/src/transformers/models/splinter/tokenization_splinter.py @@ -135,7 +135,7 @@ def __init__( question_token="[QUESTION]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -192,7 +192,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/splinter/tokenization_splinter_fast.py b/src/transformers/models/splinter/tokenization_splinter_fast.py index 6eb69755905ace..97db72caadc05c 100644 --- a/src/transformers/models/splinter/tokenization_splinter_fast.py +++ b/src/transformers/models/splinter/tokenization_splinter_fast.py @@ -113,7 +113,7 @@ def __init__( question_token="[QUESTION]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/squeezebert/configuration_squeezebert.py b/src/transformers/models/squeezebert/configuration_squeezebert.py index 639be83a6c5ab3..5757b9410fce40 100644 --- a/src/transformers/models/squeezebert/configuration_squeezebert.py +++ b/src/transformers/models/squeezebert/configuration_squeezebert.py @@ -134,7 +134,7 @@ def __init__( post_attention_groups=1, intermediate_groups=4, output_groups=4, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index a74ad5897b2adb..4f666bb7f8f098 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -315,7 +315,6 @@ def forward( output_hidden_states=False, return_dict=True, ): - if head_mask is None: head_mask_is_all_none = True elif head_mask.count(None) == len(head_mask): @@ -331,7 +330,6 @@ def forward( all_attentions = () if output_attentions else None for layer in self.layers: - if output_hidden_states: hidden_states = hidden_states.permute(0, 2, 1) all_hidden_states += (hidden_states,) @@ -645,7 +643,6 @@ def forward( @add_start_docstrings("""SqueezeBERT Model with a `language modeling` head on top.""", SQUEEZEBERT_START_DOCSTRING) class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [ r"predictions.decoder.bias", "cls.predictions.decoder.weight", diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert.py b/src/transformers/models/squeezebert/tokenization_squeezebert.py index 00d450058238fd..ed7be941e0363a 100644 --- a/src/transformers/models/squeezebert/tokenization_squeezebert.py +++ b/src/transformers/models/squeezebert/tokenization_squeezebert.py @@ -136,7 
+136,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -184,7 +184,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py index 0423c16fc33125..bf7659ffd18b4b 100644 --- a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py +++ b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py @@ -124,7 +124,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py index 89c9d556b287d3..3eb21d5663bc24 100644 --- a/src/transformers/models/swin/configuration_swin.py +++ b/src/transformers/models/swin/configuration_swin.py @@ -129,7 +129,7 @@ def __init__( layer_norm_eps=1e-5, encoder_stride=32, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -168,7 +168,6 @@ def __init__( class SwinOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py b/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py index 302bd6f3f7d1ad..72e6bce5b09e5b 100644 --- a/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py +++ b/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py @@ -18,10 +18,10 @@ import argparse +import requests import torch from PIL import Image -import requests from transformers import SwinConfig, SwinForMaskedImageModeling, ViTFeatureExtractor diff --git a/src/transformers/models/swin/convert_swin_timm_to_pytorch.py b/src/transformers/models/swin/convert_swin_timm_to_pytorch.py index 860fdd1b54d2af..c5734757e60188 100644 --- a/src/transformers/models/swin/convert_swin_timm_to_pytorch.py +++ b/src/transformers/models/swin/convert_swin_timm_to_pytorch.py @@ -1,12 +1,12 @@ import argparse import json -import torch -from PIL import Image - import requests import timm +import torch from huggingface_hub import hf_hub_download +from PIL import Image + from transformers import AutoFeatureExtractor, SwinConfig, SwinForImageClassification diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 7b48cdc84d2ff2..abf47cf83135a8 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -741,7 +741,6 @@ def forward( ) -> Tuple[torch.Tensor]: height, width = input_dimensions for i, layer_module in enumerate(self.blocks): - layer_head_mask = head_mask[i] if head_mask is not None else None layer_outputs = layer_module( diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py index 46155a7d73ad20..1022990280a98f 100644 --- a/src/transformers/models/swin/modeling_tf_swin.py +++ b/src/transformers/models/swin/modeling_tf_swin.py @@ -788,7 +788,7 @@ def __init__( num_heads: int, drop_path: List[float], downsample: Optional[Callable], - **kwargs + **kwargs, ) -> None: 
super().__init__(**kwargs) self.config = config diff --git a/src/transformers/models/swin2sr/configuration_swin2sr.py b/src/transformers/models/swin2sr/configuration_swin2sr.py index 4547b5848a1ba2..79b1dda68eca73 100644 --- a/src/transformers/models/swin2sr/configuration_swin2sr.py +++ b/src/transformers/models/swin2sr/configuration_swin2sr.py @@ -128,7 +128,7 @@ def __init__( img_range=1.0, resi_connection="1conv", upsampler="pixelshuffle", - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py b/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py index 38a11496f7ee46..6884bf0afc0cde 100644 --- a/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py +++ b/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py @@ -16,11 +16,11 @@ import argparse +import requests import torch from PIL import Image from torchvision.transforms import Compose, Normalize, Resize, ToTensor -import requests from transformers import Swin2SRConfig, Swin2SRForImageSuperResolution, Swin2SRImageProcessor diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr.py b/src/transformers/models/swin2sr/image_processing_swin2sr.py index 62ec9db16c489d..24ee846e45976b 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr.py @@ -50,7 +50,7 @@ def __init__( rescale_factor: Union[int, float] = 1 / 255, do_pad: bool = True, pad_size: int = 8, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) diff --git a/src/transformers/models/swinv2/configuration_swinv2.py b/src/transformers/models/swinv2/configuration_swinv2.py index 46d943cbe924e1..4859ccd51eb5b4 100644 --- a/src/transformers/models/swinv2/configuration_swinv2.py +++ b/src/transformers/models/swinv2/configuration_swinv2.py @@ -118,7 +118,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-5, encoder_stride=32, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py b/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py index 7af3bfb86c1741..ba70e707a949d8 100644 --- a/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py +++ b/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py @@ -18,12 +18,12 @@ import json from pathlib import Path -import torch -from PIL import Image - import requests import timm +import torch from huggingface_hub import hf_hub_download +from PIL import Image + from transformers import AutoFeatureExtractor, Swinv2Config, Swinv2ForImageClassification diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index e46decde4ca038..3104e5d2d20ef7 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -815,7 +815,6 @@ def forward( ) -> Tuple[torch.Tensor]: height, width = input_dimensions for i, layer_module in enumerate(self.blocks): - layer_head_mask = head_mask[i] if head_mask is not None else None layer_outputs = layer_module( diff --git a/src/transformers/models/switch_transformers/configuration_switch_transformers.py b/src/transformers/models/switch_transformers/configuration_switch_transformers.py index 0d84d7ee33ffaa..dd6c6c03e2cfae 100644 --- a/src/transformers/models/switch_transformers/configuration_switch_transformers.py +++ 
b/src/transformers/models/switch_transformers/configuration_switch_transformers.py @@ -129,7 +129,7 @@ def __init__( use_cache=True, pad_token_id=0, eos_token_id=1, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.d_model = d_model diff --git a/src/transformers/models/switch_transformers/convert_big_switch.py b/src/transformers/models/switch_transformers/convert_big_switch.py index aa44f9a2190d9d..86c673b48a4ede 100644 --- a/src/transformers/models/switch_transformers/convert_big_switch.py +++ b/src/transformers/models/switch_transformers/convert_big_switch.py @@ -2,12 +2,12 @@ import json import os -import torch -from tensorflow.io import gfile - import tensorstore as ts +import torch from flax import serialization from flax.traverse_util import flatten_dict, unflatten_dict +from tensorflow.io import gfile + from transformers.modeling_utils import dtype_byte_size from transformers.models.switch_transformers.convert_switch_transformers_original_flax_checkpoint_to_pytorch import ( rename_keys, diff --git a/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py b/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py index 45cd63e4743336..5937101169c6b4 100644 --- a/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py +++ b/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py @@ -20,6 +20,7 @@ from flax.traverse_util import flatten_dict, unflatten_dict from t5x import checkpoints + from transformers import SwitchTransformersConfig, SwitchTransformersForConditionalGeneration from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model from transformers.utils import logging @@ -92,7 +93,6 @@ def rename_keys(s_dict): # 3. Take extra care of the EXPERTS layer for key in list(s_dict.keys()): if "expert" in key: - num_experts = s_dict[key].shape[0] expert_weihts = s_dict[key] for idx in range(num_experts): diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index b19016bfab8bdd..77fe3be9c350e5 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -241,7 +241,6 @@ def __init__(self, hidden_size, eps=1e-6): self.variance_epsilon = eps def forward(self, hidden_states): - # SwitchTransformers uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for @@ -334,7 +333,6 @@ def forward(self, hidden_states): next_states = hidden_states.clone() for idx, expert in enumerate(self.experts.values()): - token_indices = router_mask[:, :, idx].bool() next_states[token_indices] = expert(hidden_states[token_indices]) @@ -721,7 +719,6 @@ def forward( output_router_logits=True, return_dict=True, ): - if past_key_value is not None: if not self.is_decoder: logger.warning("`past_key_values` is passed to the encoder. 
Please make sure this is intended.") @@ -939,7 +936,6 @@ def __init__(self, config, embed_tokens=None): config.num_layers = config.num_decoder_layers if self.is_decoder else config.num_layers self.block = nn.ModuleList() for i in range(config.num_layers): - is_sparse = (i % sparse_step == 1) if sparse_step > 0 else False self.block.append( @@ -1758,9 +1754,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past is used if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/t5/configuration_t5.py b/src/transformers/models/t5/configuration_t5.py index a2bd03dfd74cc8..1ad7a51412c6da 100644 --- a/src/transformers/models/t5/configuration_t5.py +++ b/src/transformers/models/t5/configuration_t5.py @@ -98,7 +98,7 @@ def __init__( use_cache=True, pad_token_id=0, eos_token_id=1, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.d_model = d_model diff --git a/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py b/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py index 68e66c8298238f..11f32c8461e97c 100644 --- a/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py +++ b/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py @@ -18,6 +18,7 @@ import argparse from t5x import checkpoints + from transformers import FlaxT5ForConditionalGeneration, T5Config diff --git a/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py b/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py index f58cee4ae8269f..aebe5d88fca365 100755 --- a/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py +++ b/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py @@ -32,9 +32,9 @@ import collections import torch - from flax import traverse_util from t5x import checkpoints + from transformers import T5Config, T5EncoderModel, T5ForConditionalGeneration from transformers.utils import logging diff --git a/src/transformers/models/t5/modeling_flax_t5.py b/src/transformers/models/t5/modeling_flax_t5.py index 1006458fbd5a92..249d4913e010ef 100644 --- a/src/transformers/models/t5/modeling_flax_t5.py +++ b/src/transformers/models/t5/modeling_flax_t5.py @@ -18,11 +18,10 @@ import copy from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -943,7 +942,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1439,7 +1438,6 @@ def __call__( return_dict: bool = True, deterministic: bool = True, ): - # Encode if needed (training, first prediction pass) encoder_outputs = self.encoder( input_ids=input_ids, @@ -1745,7 +1743,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/t5/modeling_t5.py 
b/src/transformers/models/t5/modeling_t5.py index f329f32d42d249..49adf4c421af34 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -245,7 +245,6 @@ def __init__(self, hidden_size, eps=1e-6): self.variance_epsilon = eps def forward(self, hidden_states): - # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for @@ -666,7 +665,6 @@ def forward( output_attentions=False, return_dict=True, ): - if past_key_value is not None: if not self.is_decoder: logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") @@ -1722,9 +1720,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past is used if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index 0e420a85d4deca..f9996e15314e2b 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -547,7 +547,6 @@ def call( output_attentions=False, training=False, ): - if past_key_value is not None: assert self.is_decoder, "Only decoder can use `past_key_values`" expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 @@ -663,7 +662,6 @@ def call( return_dict=None, training=False, ) -> Tuple: - if input_ids is not None and inputs_embeds is not None: err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError( @@ -1505,9 +1503,8 @@ def prepare_inputs_for_generation( decoder_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past is used if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 4bdd3a907710c1..400c956a3d5771 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -120,7 +120,7 @@ def __init__( extra_ids=100, additional_special_tokens=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py index 215df486786de0..589a346ed019bd 100644 --- a/src/transformers/models/t5/tokenization_t5_fast.py +++ b/src/transformers/models/t5/tokenization_t5_fast.py @@ -115,7 +115,7 @@ def __init__( pad_token="<pad>", extra_ids=100, additional_special_tokens=None, - **kwargs + **kwargs, ): # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index bc7bd6034e4efe..d74424ce69fe3a 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -176,7 +176,7 @@ def __init__( bbox_loss_coefficient=5, giou_loss_coefficient=2, eos_coefficient=0.1, - **kwargs + **kwargs, ): if 
backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") @@ -238,7 +238,6 @@ def hidden_size(self) -> int: # Copied from transformers.models.detr.configuration_detr.DetrOnnxConfig class TableTransformerOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py index a53bd9e03d80d7..1973fe82e9d550 100644 --- a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py @@ -23,10 +23,10 @@ from pathlib import Path import torch +from huggingface_hub import hf_hub_download from PIL import Image from torchvision.transforms import functional as F -from huggingface_hub import hf_hub_download from transformers import DetrFeatureExtractor, TableTransformerConfig, TableTransformerForObjectDetection from transformers.utils import logging diff --git a/src/transformers/models/tapas/configuration_tapas.py b/src/transformers/models/tapas/configuration_tapas.py index 71fd5715ef57fb..f466ab42545f04 100644 --- a/src/transformers/models/tapas/configuration_tapas.py +++ b/src/transformers/models/tapas/configuration_tapas.py @@ -193,9 +193,8 @@ def __init__( disable_per_token_loss=False, aggregation_labels=None, no_aggregation_label_index=None, - **kwargs + **kwargs, ): - super().__init__(pad_token_id=pad_token_id, **kwargs) # BERT hyperparameters (with updated max_position_embeddings and type_vocab_sizes) diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 55243f01fcdaba..83a3f9fefc9860 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -289,7 +289,6 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs position_ids = position_ids.unsqueeze(0).expand(input_shape) # when self.config.reset_position_index_per_cell is set to True, create relative position embeddings if self.config.reset_position_index_per_cell: - # shape (batch_size, seq_len) col_index = IndexMap(token_type_ids[:, :, 1], self.config.type_vocab_sizes[1], batch_dims=1) # shape (batch_size, seq_len) @@ -1023,7 +1022,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **kwargs + **kwargs, ) -> Union[Tuple, MaskedLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index 1d25e04dd80df8..5c995aa93014a2 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -216,7 +216,6 @@ def call( position_ids = tf.broadcast_to(position_ids, shape=input_shape) # when self.config.reset_position_index_per_cell is set to True, create relative position embeddings if self.reset_position_index_per_cell: - # shape (batch_size, seq_len) col_index = IndexMap(token_type_ids[:, :, 1], self.config.type_vocab_sizes[1], batch_dims=1) # shape (batch_size, seq_len) @@ -779,7 +778,6 @@ def call( return_dict: 
Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index d3c3d934dbd9e4..395ec876c9f410 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -340,7 +340,7 @@ def __init__( max_question_length=None, model_max_length: int = 512, additional_special_tokens: Optional[List[str]] = None, - **kwargs + **kwargs, ): if not is_pandas_available(): raise ImportError("Pandas is required for the TAPAS tokenizer.") @@ -418,7 +418,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) @@ -602,7 +601,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) related to a table. @@ -716,7 +715,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepare a table and a list of strings for the model. @@ -823,7 +822,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: table_tokens = self._tokenize_table(table) @@ -882,7 +881,7 @@ def _batch_prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: batch_outputs = {} @@ -944,7 +943,7 @@ def encode( truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> List[int]: """ Prepare a table and a string for the model. This method does not return token type IDs, attention masks, etc. @@ -996,7 +995,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepare a table and a string for the model. @@ -1077,7 +1076,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ): if query is None: query = "" @@ -1136,7 +1135,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence of input id so that it can be used by the model. 
It adds special tokens, truncates diff --git a/src/transformers/models/tapex/tokenization_tapex.py b/src/transformers/models/tapex/tokenization_tapex.py index c91c9b16ae7abc..c41c6cbe47ae94 100644 --- a/src/transformers/models/tapex/tokenization_tapex.py +++ b/src/transformers/models/tapex/tokenization_tapex.py @@ -284,7 +284,7 @@ def __init__( mask_token="<mask>", add_prefix_space=False, max_cell_length=15, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -531,7 +531,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several table-sequence pair(s). @@ -608,7 +608,7 @@ def source_call_func( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # Input type checking for clearer error valid_table = False @@ -695,7 +695,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ @@ -753,9 +753,8 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " @@ -875,7 +874,7 @@ def encode( truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None, max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> List[int]: """ Prepare a table, a string and possible answer for the model. This method does not return token type IDs, @@ -914,7 +913,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( @@ -964,7 +963,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -1020,7 +1019,7 @@ def target_call_func( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ The method tokenizes and prepares the answer label for the model. @@ -1084,7 +1083,7 @@ def target_batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepare answer strings for the model. 
@@ -1138,12 +1137,10 @@ def _target_batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - batch_outputs = {} for text in answer: - if self.do_lower_case: text = text.lower() @@ -1191,7 +1188,7 @@ def target_encode( truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None, max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> List[int]: """ Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc. @@ -1229,7 +1226,7 @@ def target_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepare a answer string for the model. @@ -1282,7 +1279,7 @@ def _target_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 8d89d5cd7f19f1..7066d1a2d84c9e 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -161,7 +161,7 @@ def __init__( num_parallel_samples: int = 100, init_std: float = 0.02, use_cache=True, - **kwargs + **kwargs, ): # time series specific configuration self.prediction_length = prediction_length diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index e9f412c1bfec78..acc64332a76893 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1347,7 +1347,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -1372,7 +1371,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/timesformer/configuration_timesformer.py b/src/transformers/models/timesformer/configuration_timesformer.py index d3db62e53d25f5..2e30a8c5dff268 100644 --- a/src/transformers/models/timesformer/configuration_timesformer.py +++ b/src/transformers/models/timesformer/configuration_timesformer.py @@ -104,7 +104,7 @@ def __init__( qkv_bias=True, attention_type="divided_space_time", drop_path_rate=0, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py b/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py index ca58994db82836..5a66776d4381a3 100644 --- a/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py +++ b/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py @@ -17,11 +17,11 @@ import argparse import json +import gdown import numpy as np import torch - -import gdown from huggingface_hub import hf_hub_download + from transformers import TimesformerConfig, TimesformerForVideoClassification, VideoMAEFeatureExtractor diff --git a/src/transformers/models/timesformer/modeling_timesformer.py b/src/transformers/models/timesformer/modeling_timesformer.py index 611d854262fd93..b5a525127b3def 100644 --- a/src/transformers/models/timesformer/modeling_timesformer.py +++ b/src/transformers/models/timesformer/modeling_timesformer.py @@ -235,7 +235,6 @@ def __init__(self, config: TimesformerConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -274,7 +273,6 @@ def __init__(self, config: TimesformerConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) hidden_states = self.dropout(hidden_states) diff --git a/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py b/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py index 4042d79ea5f6f8..875980fde19e7b 100644 --- a/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py +++ b/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py @@ -133,7 +133,7 @@ def __init__( pad_token_id=1, bos_token_id=50256, eos_token_id=50256, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.action_weight = action_weight diff --git a/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py index 14e6556e07b7a1..622552fa783608 100644 --- a/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py @@ -15,8 +15,8 @@ """ TrajectoryTransformer pytorch checkpoint conversion""" import torch - import trajectory.utils as utils + from transformers import TrajectoryTransformerModel diff --git 
a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py index cf41166b9390a9..fee99ce4e56350 100644 --- a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py +++ b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py @@ -538,7 +538,6 @@ def forward( all_hidden_states = () if output_hidden_states else None for i, (block, layer_past) in enumerate(zip(self.blocks, past_key_values)): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) diff --git a/src/transformers/models/transfo_xl/configuration_transfo_xl.py b/src/transformers/models/transfo_xl/configuration_transfo_xl.py index c9b8464b1039de..8550e71802867a 100644 --- a/src/transformers/models/transfo_xl/configuration_transfo_xl.py +++ b/src/transformers/models/transfo_xl/configuration_transfo_xl.py @@ -141,7 +141,7 @@ def __init__( init_std=0.02, layer_norm_epsilon=1e-5, eos_token_id=0, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.cutoffs = [] diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py index f2c4653b7298dd..93af2165111288 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py @@ -128,7 +128,7 @@ def __init__( layer_norm_epsilon=1e-5, init_std=0.02, output_attentions=False, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -281,7 +281,7 @@ def __init__( layer_norm_epsilon=1e-5, init_std=0.02, output_attentions=False, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -551,7 +551,6 @@ def call( labels: Optional[Union[np.ndarray, tf.Tensor]] = None, training: bool = False, ): - # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # so we transpose here from shape [bsz, len] to shape [len, bsz] if input_ids is not None and inputs_embeds is not None: diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py index fb0157dd6f1b09..094b2d33f6855c 100644 --- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py @@ -381,7 +381,6 @@ def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon ) def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None, output_attentions=False): - attn_outputs = self.dec_attn( dec_inp, r, @@ -860,7 +859,6 @@ def _update_mems(self, hids, mems, mlen, qlen): end_idx = mlen + max(0, qlen) beg_idx = max(0, end_idx - self.mem_len) for i in range(len(hids)): - cat = torch.cat([mems[i], hids[i]], dim=0) new_mems.append(cat[beg_idx:end_idx].detach()) diff --git a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py index 5b284a219a4753..13977d43828051 100644 --- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py +++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py @@ -179,7 +179,7 @@ def __init__( eos_token="<eos>", additional_special_tokens=["<formula>"], language="en", - **kwargs + **kwargs, ): super().__init__( special=special, @@ -643,7 +643,6 @@ def __iter__(self): class LMMultiFileIterator(LMShuffledIterator): def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False): - 
self.paths = paths self.vocab = vocab diff --git a/src/transformers/models/trocr/configuration_trocr.py b/src/transformers/models/trocr/configuration_trocr.py index 901370c31eeb32..b3f03373618484 100644 --- a/src/transformers/models/trocr/configuration_trocr.py +++ b/src/transformers/models/trocr/configuration_trocr.py @@ -121,7 +121,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.d_model = d_model diff --git a/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py b/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py index 997fd747621094..26291296817bb2 100644 --- a/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py +++ b/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py @@ -18,10 +18,10 @@ import argparse from pathlib import Path +import requests import torch from PIL import Image -import requests from transformers import ( RobertaTokenizer, TrOCRConfig, diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index df7f6c569f474e..98ab48a938c5ed 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -689,7 +689,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache =" @@ -715,7 +714,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py index d041075c9b626e..f4e8df659e9a01 100644 --- a/src/transformers/models/unispeech/configuration_unispeech.py +++ b/src/transformers/models/unispeech/configuration_unispeech.py @@ -226,7 +226,7 @@ def __init__( bos_token_id=1, eos_token_id=2, replace_prob=0.5, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index af7dc87e673ed9..d2866652377196 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1423,7 +1423,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py index 3205bbc2cca805..222f982fe769bb 100644 --- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py @@ -239,7 +239,7 @@ def __init__( bos_token_id=1, eos_token_id=2, num_clusters=504, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 7ce1b4465b0ddb..1b197108ef43de 100755 --- 
a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1430,7 +1430,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1607,8 +1606,7 @@ def __init__(self, config): if hasattr(config, "add_adapter") and config.add_adapter: raise ValueError( - "Audio frame classification does not support the use of UniSpeechSat adapters" - " (config.add_adapter=True)" + "Audio frame classification does not support the use of UniSpeechSat adapters (config.add_adapter=True)" ) self.unispeech_sat = UniSpeechSatModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index 449e69def3e35b..593e8953a27358 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -85,7 +85,7 @@ def __init__( auxiliary_num_convs=1, auxiliary_concat_input=False, loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py b/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py index 0b7b6e11b11d91..eeb3ab5fc99381 100644 --- a/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py +++ b/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py @@ -17,11 +17,11 @@ import argparse import json +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ConvNextConfig, SegformerImageProcessor, UperNetConfig, UperNetForSemanticSegmentation diff --git a/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py b/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py index a44549a4703cbc..9580af7c46a50c 100644 --- a/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py +++ b/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py @@ -20,11 +20,11 @@ import argparse import json +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import SegformerImageProcessor, SwinConfig, UperNetConfig, UperNetForSemanticSegmentation diff --git a/src/transformers/models/van/configuration_van.py b/src/transformers/models/van/configuration_van.py index 47d5a9b6c11aa1..85a0dd20e47728 100644 --- a/src/transformers/models/van/configuration_van.py +++ b/src/transformers/models/van/configuration_van.py @@ -94,7 +94,7 @@ def __init__( layer_scale_init_value=1e-2, drop_path_rate=0.0, dropout_rate=0.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.image_size = image_size diff --git a/src/transformers/models/van/convert_van_to_pytorch.py b/src/transformers/models/van/convert_van_to_pytorch.py index ded3c3500dad24..a8086e6d1b511b 100644 --- a/src/transformers/models/van/convert_van_to_pytorch.py +++ b/src/transformers/models/van/convert_van_to_pytorch.py @@ -27,9 +27,9 @@ import torch import torch.nn as nn +from huggingface_hub import cached_download, hf_hub_download from torch import Tensor -from huggingface_hub import cached_download, hf_hub_download from transformers 
import AutoFeatureExtractor, VanConfig, VanForImageClassification from transformers.models.van.modeling_van import VanLayerScaling from transformers.utils import logging diff --git a/src/transformers/models/videomae/configuration_videomae.py b/src/transformers/models/videomae/configuration_videomae.py index 932c4c1d98cabf..8120bb23fc2a6c 100644 --- a/src/transformers/models/videomae/configuration_videomae.py +++ b/src/transformers/models/videomae/configuration_videomae.py @@ -119,7 +119,7 @@ def __init__( decoder_num_hidden_layers=4, decoder_intermediate_size=1536, norm_pix_loss=True, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 2f4ce5d44704a7..68f6372ab53ed6 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -17,11 +17,11 @@ import argparse import json +import gdown import numpy as np import torch - -import gdown from huggingface_hub import hf_hub_download + from transformers import ( VideoMAEConfig, VideoMAEFeatureExtractor, diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 451d2461c344c7..e3edfdf00d27a0 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -115,7 +115,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -140,7 +140,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -171,7 +171,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `size` along any @@ -195,7 +195,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -216,7 +216,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 8831f65a49b19c..ee166317909e9e 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -106,6 +106,7 @@ class VideoMAEForPreTrainingOutput(ModelOutput): # https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31 def get_sinusoid_encoding_table(n_position, d_hid): """Sinusoid position encoding table""" + # TODO: make it with torch instead of numpy def get_position_angle_vec(position): return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] @@ -236,7 +237,6 @@ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias) values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias) @@ -286,7 +286,6 @@ def __init__(self, config: VideoMAEConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -344,7 +343,6 @@ def __init__(self, config: VideoMAEConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/vilt/configuration_vilt.py b/src/transformers/models/vilt/configuration_vilt.py index 7a5c856413a08e..3c5935f1f1b22a 100644 --- a/src/transformers/models/vilt/configuration_vilt.py +++ b/src/transformers/models/vilt/configuration_vilt.py @@ -120,7 +120,7 @@ def __init__( max_image_length=-1, tie_word_embeddings=False, num_images=-1, - **kwargs + **kwargs, ): super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py b/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py index 5e737f784c81b2..693ca34efcf6e3 100644 --- a/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py +++ b/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( BertTokenizer, ViltConfig, diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index 9e50c1f7d58f6b..783197c7e956b3 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -165,7 +165,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -204,7 +204,7 @@ def resize( size_divisor: int = 32, 
resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -238,7 +238,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -259,7 +259,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 0be05ea4aa1e93..61cc69b694bf09 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -399,7 +399,6 @@ def __init__(self, config: ViltConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -451,7 +450,6 @@ def __init__(self, config: ViltConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) @@ -1419,7 +1417,6 @@ def forward( VILT_START_DOCSTRING, ) class ViltForTokenClassification(ViltPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 2578724066bec2..4fd8d9188faa49 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -77,7 +77,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py index 0bdc6366867c2c..8561875ed59957 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py @@ -284,7 +284,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): if not _do_init: raise ValueError( @@ -553,7 +553,6 @@ def decode( def _decoder_forward( module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs ): - projection_module = module._get_projection_module() decoder_module = module._get_decoder_module() @@ -691,7 +690,7 @@ def prepare_inputs_for_generation( max_length, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape @@ -727,7 +726,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = 
None, *model_args, - **kwargs + **kwargs, ) -> FlaxPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py index 88d9b83bea6bab..b3d3c2b5707e31 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py @@ -147,7 +147,6 @@ # Copied from transformers.models.encoder_decoder.modeling_tf_encoder_decoder.shift_tokens_right def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): - if pad_token_id is None: raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.") pad_token_id = tf.cast(pad_token_id, input_ids.dtype) @@ -368,7 +367,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, - **kwargs + **kwargs, ) -> TFPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model @@ -603,7 +602,6 @@ def call( ) if encoder_outputs is None: - encoder_inputs = { "input_ids": pixel_values, "output_attentions": output_attentions, diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 91c08901ce0014..34ead714658c2f 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -368,7 +368,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, - **kwargs + **kwargs, ) -> PreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py index 4b163965486308..12453fde98125b 100644 --- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py @@ -227,9 +227,8 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): - if not _do_init: raise ValueError( "`FlaxVisionTextDualEncoderModel` cannot be created without initializing, `_do_init` must be `True`." 
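The `# TODO: make it with torch instead of numpy` note added to `get_sinusoid_encoding_table` in `modeling_videomae.py` above hints at a torch-only rewrite of that helper. A minimal sketch of what such a version could look like (not part of this diff; the function name and the `(1, n_position, d_hid)` return shape are only assumed to mirror the NumPy implementation):

```python
import torch


def get_sinusoid_encoding_table(n_position: int, d_hid: int) -> torch.Tensor:
    """Sinusoid position encoding table, computed with torch instead of numpy (sketch)."""
    position = torch.arange(n_position, dtype=torch.float32).unsqueeze(1)  # (n_position, 1)
    dim = torch.arange(d_hid, dtype=torch.float32).unsqueeze(0)  # (1, d_hid)
    # same angle formula as the numpy version: pos / 10000 ** (2 * (j // 2) / d_hid)
    angles = position / torch.pow(10000, 2 * (dim // 2) / d_hid)
    table = torch.empty(n_position, d_hid)
    table[:, 0::2] = torch.sin(angles[:, 0::2])  # even dimensions get sine
    table[:, 1::2] = torch.cos(angles[:, 1::2])  # odd dimensions get cosine
    return table.unsqueeze(0)  # (1, n_position, d_hid)
```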
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index d9c075f6d9f34a..3e39c6b005a979 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -169,7 +169,6 @@ def __init__( vision_model: Optional[PreTrainedModel] = None, text_model: Optional[PreTrainedModel] = None, ): - if config is None and (vision_model is None or text_model is None): raise ValueError("Either a configuration or an vision and a text model has to be provided") diff --git a/src/transformers/models/visual_bert/configuration_visual_bert.py b/src/transformers/models/visual_bert/configuration_visual_bert.py index f256a286a0bcd8..a7282ef2bb5387 100644 --- a/src/transformers/models/visual_bert/configuration_visual_bert.py +++ b/src/transformers/models/visual_bert/configuration_visual_bert.py @@ -130,7 +130,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index d9250d73b17086..38ec21aff57b7c 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -1587,7 +1587,6 @@ def forward( loss = None if labels is not None: - # scores = batch x selected position x visual_feature # scores = selected_positions.bmm(visual_features.transpose(1,2)) # label = batch x selected_postion x needed position diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index dcb3ac795217ab..ed760189634f58 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -106,7 +106,7 @@ def __init__( num_channels=3, qkv_bias=True, encoder_stride=16, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -127,7 +127,6 @@ def __init__( class ViTOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/vit/convert_dino_to_pytorch.py b/src/transformers/models/vit/convert_dino_to_pytorch.py index 1a8ba21a658b8b..ceedc2603a74bf 100644 --- a/src/transformers/models/vit/convert_dino_to_pytorch.py +++ b/src/transformers/models/vit/convert_dino_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ViTConfig, ViTFeatureExtractor, ViTForImageClassification, ViTModel from transformers.utils import logging diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index bc1f7f72dd5f3c..718e8624cc8f4a 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -19,12 +19,12 @@ import json from pathlib import Path -import torch -from PIL import Image - import requests import timm +import torch from huggingface_hub import hf_hub_download +from PIL import Image + from transformers import DeiTFeatureExtractor, 
ViTConfig, ViTFeatureExtractor, ViTForImageClassification, ViTModel from transformers.utils import logging diff --git a/src/transformers/models/vit/image_processing_vit.py b/src/transformers/models/vit/image_processing_vit.py index 4b089443b58f09..5ca0d932880875 100644 --- a/src/transformers/models/vit/image_processing_vit.py +++ b/src/transformers/models/vit/image_processing_vit.py @@ -81,7 +81,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 224, "width": 224} @@ -101,7 +101,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. @@ -157,7 +157,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/vit/modeling_flax_vit.py b/src/transformers/models/vit/modeling_flax_vit.py index ff9230777bbb6b..1ab2671efd75bf 100644 --- a/src/transformers/models/vit/modeling_flax_vit.py +++ b/src/transformers/models/vit/modeling_flax_vit.py @@ -85,7 +85,6 @@ class FlaxViTPatchEmbeddings(nn.Module): - config: ViTConfig dtype: jnp.dtype = jnp.float32 # the dtype of the computation @@ -356,7 +355,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - all_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None @@ -444,7 +442,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) if input_shape is None: @@ -523,7 +521,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - hidden_states = self.embeddings(pixel_values, deterministic=deterministic) outputs = self.encoder( diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index acc4f68bf38efb..6d0c579a43db0a 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -65,7 +65,6 @@ def __init__(self, config: ViTConfig, **kwargs): self.config = config def build(self, input_shape: tf.TensorShape): - num_patches = self.patch_embeddings.num_patches self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), @@ -496,7 +495,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -685,7 +683,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - outputs = self.vit( pixel_values=pixel_values, head_mask=head_mask, diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 79f61a392b7375..1fb4c4f1022eef 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -248,7 +248,6 @@ def __init__(self, config: ViTConfig) -> None: self.dropout = 
nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -304,7 +303,6 @@ def __init__(self, config: ViTConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index abc9920782e07f..34b778a1f94414 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -107,7 +107,7 @@ def __init__( num_channels=3, backbone_featmap_shape=[1, 1024, 24, 24], qkv_bias=True, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index ad5847360f980e..e88ee246ba1c1d 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -19,14 +19,14 @@ import json from pathlib import Path -import torch -from PIL import Image - import requests import timm +import torch from huggingface_hub import hf_hub_download +from PIL import Image from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform + from transformers import ( BitConfig, ViTHybridConfig, diff --git a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py index 12bb63fc66feeb..8d7b5b83afe4b9 100644 --- a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py @@ -102,7 +102,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -128,7 +128,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -155,7 +155,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the @@ -179,7 +179,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -200,7 +200,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
@@ -233,7 +233,7 @@ def preprocess( do_convert_rgb: bool = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 650e065cf3caaf..df35c13f689dca 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -267,7 +267,6 @@ def __init__(self, config: ViTHybridConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -325,7 +324,6 @@ def __init__(self, config: ViTHybridConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/vit_mae/configuration_vit_mae.py b/src/transformers/models/vit_mae/configuration_vit_mae.py index b7167fbbdc149a..4c065421a9f19d 100644 --- a/src/transformers/models/vit_mae/configuration_vit_mae.py +++ b/src/transformers/models/vit_mae/configuration_vit_mae.py @@ -116,7 +116,7 @@ def __init__( decoder_intermediate_size=2048, mask_ratio=0.75, norm_pix_loss=False, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py b/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py index 4cf9a75b674b42..fc61d8924c8d01 100644 --- a/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py +++ b/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py @@ -16,10 +16,10 @@ import argparse +import requests import torch from PIL import Image -import requests from transformers import ViTMAEConfig, ViTMAEFeatureExtractor, ViTMAEForPreTraining diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py index 119b7ff3783444..ef0c7c9f36869e 100755 --- a/src/transformers/models/vit_mae/modeling_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_vit_mae.py @@ -388,7 +388,6 @@ def __init__(self, config: ViTMAEConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -446,7 +445,6 @@ def __init__(self, config: ViTMAEConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/vit_msn/configuration_vit_msn.py b/src/transformers/models/vit_msn/configuration_vit_msn.py index 057824e5d4e133..87d9a37a68e067 100644 --- a/src/transformers/models/vit_msn/configuration_vit_msn.py +++ b/src/transformers/models/vit_msn/configuration_vit_msn.py @@ -98,7 +98,7 @@ def __init__( patch_size=16, num_channels=3, qkv_bias=True, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/vit_msn/convert_msn_to_pytorch.py 
b/src/transformers/models/vit_msn/convert_msn_to_pytorch.py index f04d26d5eb886a..482073a4faa005 100644 --- a/src/transformers/models/vit_msn/convert_msn_to_pytorch.py +++ b/src/transformers/models/vit_msn/convert_msn_to_pytorch.py @@ -17,11 +17,11 @@ import argparse import json +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ViTFeatureExtractor, ViTMSNConfig, ViTMSNModel from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py b/src/transformers/models/vit_msn/modeling_vit_msn.py index 61faa34eb978f4..d2c8547aa0d83a 100644 --- a/src/transformers/models/vit_msn/modeling_vit_msn.py +++ b/src/transformers/models/vit_msn/modeling_vit_msn.py @@ -238,7 +238,6 @@ def __init__(self, config: ViTMSNConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -296,7 +295,6 @@ def __init__(self, config: ViTMSNConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/wav2vec2/configuration_wav2vec2.py b/src/transformers/models/wav2vec2/configuration_wav2vec2.py index ab7d116118171f..7afcd3f0ee28e2 100644 --- a/src/transformers/models/wav2vec2/configuration_wav2vec2.py +++ b/src/transformers/models/wav2vec2/configuration_wav2vec2.py @@ -253,7 +253,7 @@ def __init__( adapter_stride=2, num_adapter_layers=3, output_hidden_size=None, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py index c02a23d8a44060..9550b7c2a9ef90 100644 --- a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py +++ b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py @@ -71,7 +71,7 @@ def __init__( padding_value=0.0, return_attention_mask=False, do_normalize=True, - **kwargs + **kwargs, ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) self.return_attention_mask = return_attention_mask @@ -109,7 +109,7 @@ def __call__( return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, sampling_rate: Optional[int] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). 
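A large share of the hunks above and below make the same one-character change: a trailing comma after the final `**kwargs` parameter. Python 3.6 and later accept a trailing comma after `*args`/`**kwargs` in a signature, and black treats such a "magic trailing comma" as a signal to keep the parameter list expanded one per line. A tiny illustration with a hypothetical function (not from the repository):

```python
# Hypothetical signature showing the formatting change applied throughout this diff:
# the trailing comma after **kwargs is valid syntax on Python >= 3.6 and tells black
# to keep the parameters on separate lines ("magic trailing comma").
def build_processor(
    do_resize: bool = True,
    do_normalize: bool = True,
    **kwargs,
):
    return {"do_resize": do_resize, "do_normalize": do_normalize, **kwargs}
```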
diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py index 364d1590efa9d7..86cfb5e089ea00 100644 --- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py @@ -17,12 +17,11 @@ from functools import partial from typing import Optional, Tuple, Union -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen.attention import dot_product_attention_weights from flax.traverse_util import flatten_dict, unflatten_dict @@ -663,7 +662,6 @@ def __call__( output_hidden_states=False, return_dict=True, ): - if attention_mask is not None: # make sure padded tokens are not attended to hidden_states = jnp.where( @@ -1034,7 +1032,6 @@ def _conv_out_length(input_length, kernel_size, stride): def _get_feature_vector_attention_mask( self, feature_vector_length: int, attention_mask: jnp.ndarray, add_adapter=None ): - # Effectively attention_mask.sum(-1), but not inplace to be able to run # on inference mode. non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index dfaf53099c0f5a..a3f2fd0e1e175f 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -357,7 +357,6 @@ def __init__( self._check_axis() def build(self, input_shape): - self._check_if_input_shape_is_none(input_shape) self._set_number_of_groups_for_instance_norm(input_shape) self._check_size_of_dimensions(input_shape) @@ -369,7 +368,6 @@ def build(self, input_shape): super().build(input_shape) def call(self, inputs): - input_shape = tf.keras.backend.int_shape(inputs) tensor_input_shape = tf.shape(inputs) @@ -406,7 +404,6 @@ def compute_output_shape(self, input_shape): return input_shape def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): - group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] is_instance_norm = (input_shape[self.axis] // self.groups) == 1 if not is_instance_norm: @@ -419,7 +416,6 @@ def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): return inputs, group_shape def _apply_normalization(self, reshaped_inputs, input_shape): - group_shape = tf.keras.backend.int_shape(reshaped_inputs) group_reduction_axes = list(range(1, len(group_shape))) is_instance_norm = (input_shape[self.axis] // self.groups) == 1 @@ -471,7 +467,6 @@ def _set_number_of_groups_for_instance_norm(self, input_shape): self.groups = dim def _check_size_of_dimensions(self, input_shape): - dim = input_shape[self.axis] if dim < self.groups: raise ValueError( @@ -492,19 +487,16 @@ def _check_size_of_dimensions(self, input_shape): ) def _check_axis(self): - if self.axis == 0: raise ValueError( "You are trying to normalize your batch axis. 
Do you want to use tf.layer.batch_normalization instead" ) def _create_input_spec(self, input_shape): - dim = input_shape[self.axis] self.input_spec = tf.keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim}) def _add_gamma_weight(self, input_shape): - dim = input_shape[self.axis] shape = (dim,) @@ -520,7 +512,6 @@ def _add_gamma_weight(self, input_shape): self.gamma = None def _add_beta_weight(self, input_shape): - dim = input_shape[self.axis] shape = (dim,) @@ -1684,7 +1675,6 @@ def call( logits = self.lm_head(hidden_states) if labels is not None: - if tf.reduce_max(labels) >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index c364d52807c615..3aa983f8e570ba 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1692,7 +1692,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py index f4c4bea0722ffe..42fd1131cf0f26 100644 --- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py +++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py @@ -168,7 +168,7 @@ def __init__( word_delimiter_token="|", replace_word_delimiter_char=" ", do_lower_case=False, - **kwargs + **kwargs, ): super().__init__( unk_token=unk_token, @@ -424,7 +424,7 @@ def batch_decode( clean_up_tokenization_spaces: bool = True, output_char_offsets: bool = False, output_word_offsets: bool = False, - **kwargs + **kwargs, ) -> List[str]: """ Convert a list of lists of token ids into a list of strings by calling decode. 
@@ -494,7 +494,7 @@ def decode( clean_up_tokenization_spaces: bool = True, output_char_offsets: bool = False, output_word_offsets: bool = False, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special @@ -735,7 +735,7 @@ def __init__( do_lower_case=False, do_normalize=False, return_attention_mask=False, - **kwargs + **kwargs, ): super().__init__( unk_token=unk_token, @@ -803,7 +803,7 @@ def __call__( pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -888,7 +888,7 @@ def _decode( token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, - **kwargs + **kwargs, ) -> str: """ special _decode function is needed for Wav2Vec2Tokenizer because added tokens should be treated exactly the diff --git a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py index 2a3f951b3960e6..b9f24c7e708057 100644 --- a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py @@ -266,7 +266,7 @@ def __init__( max_source_positions=5000, conv_depthwise_kernel_size=31, conformer_conv_dropout=0.1, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index c1dc7ae94a77cc..12ce81465fd806 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -1604,10 +1604,10 @@ def __init__(self, config): if config.vocab_size is None: raise ValueError( - f"You are trying to instantiate {self.__class__} with a configuration that does not define the" - " vocabulary size of the language model head. Please instantiate the model as follows:" - " `Wav2Vec2ConformerForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of" - " your model's configuration." + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `Wav2Vec2ConformerForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." 
) output_hidden_size = ( config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size @@ -1668,7 +1668,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1721,8 +1720,7 @@ def __init__(self, config): if hasattr(config, "add_adapter") and config.add_adapter: raise ValueError( - "Sequence classification does not support the use of Wav2Vec2Conformer adapters" - " (config.add_adapter=True)" + "Sequence classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)" ) self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings @@ -1833,8 +1831,7 @@ def __init__(self, config): if hasattr(config, "add_adapter") and config.add_adapter: raise ValueError( - "Audio frame classification does not support the use of Wav2Vec2Conformer adapters" - " (config.add_adapter=True)" + "Audio frame classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)" ) self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py index c983c4be826430..74e2d3525b01c8 100644 --- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py @@ -141,7 +141,7 @@ def __init__( do_phonemize=True, phonemizer_lang="en-us", phonemizer_backend="espeak", - **kwargs + **kwargs, ): super().__init__( unk_token=unk_token, @@ -453,7 +453,7 @@ def decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, output_char_offsets: bool = False, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special @@ -509,7 +509,7 @@ def batch_decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, output_char_offsets: bool = False, - **kwargs + **kwargs, ) -> List[str]: """ Convert a list of lists of token ids into a list of strings by calling decode. diff --git a/src/transformers/models/wavlm/configuration_wavlm.py b/src/transformers/models/wavlm/configuration_wavlm.py index 4c97657d026b42..becbe100240d88 100644 --- a/src/transformers/models/wavlm/configuration_wavlm.py +++ b/src/transformers/models/wavlm/configuration_wavlm.py @@ -246,7 +246,7 @@ def __init__( adapter_stride=2, num_adapter_layers=3, output_hidden_size=None, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py index 91758cc9595290..84e3d231ea3845 100644 --- a/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py @@ -19,8 +19,6 @@ import torch -from transformers import WavLMConfig, WavLMModel, logging - # Step 1. clone https://github.com/microsoft/unilm # Step 2. 
git checkout to https://github.com/microsoft/unilm/commit/b94ec76c36f02fb2b0bf0dcb0b8554a2185173cd # Step 3. cd unilm @@ -29,6 +27,8 @@ from unilm.wavlm.WavLM import WavLM as WavLMOrig from unilm.wavlm.WavLM import WavLMConfig as WavLMConfigOrig +from transformers import WavLMConfig, WavLMModel, logging + logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -179,7 +179,6 @@ def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_gro @torch.no_grad() def convert_wavlm_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - # load the pre-trained checkpoints checkpoint = torch.load(checkpoint_path) cfg = WavLMConfigOrig(checkpoint["cfg"]) diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index e4813447168888..f4e9305d6bcd9e 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -1352,7 +1352,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py index 56159eb9359b14..934d54be48fcfb 100644 --- a/src/transformers/models/whisper/configuration_whisper.py +++ b/src/transformers/models/whisper/configuration_whisper.py @@ -185,7 +185,7 @@ def __init__( eos_token_id=50256, suppress_tokens=None, begin_suppress_tokens=[220, 50256], - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.num_mel_bins = num_mel_bins diff --git a/src/transformers/models/whisper/convert_openai_to_hf.py b/src/transformers/models/whisper/convert_openai_to_hf.py index d838515fec859d..7c2e0c40a0422f 100644 --- a/src/transformers/models/whisper/convert_openai_to_hf.py +++ b/src/transformers/models/whisper/convert_openai_to_hf.py @@ -50,7 +50,6 @@ def remove_ignore_keys_(state_dict): "mlp.0": "fc1", "mlp.2": "fc2", "mlp_ln": "final_layer_norm", - "blocks": "layers", ".attn.query": ".self_attn.q_proj", ".attn.key": ".self_attn.k_proj", ".attn.value": ".self_attn.v_proj", diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py index 0b476ee34a302b..77c85a2477ade9 100644 --- a/src/transformers/models/whisper/feature_extraction_whisper.py +++ b/src/transformers/models/whisper/feature_extraction_whisper.py @@ -66,7 +66,7 @@ def __init__( n_fft=400, padding_value=0.0, return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask - **kwargs + **kwargs, ): super().__init__( feature_size=feature_size, @@ -225,7 +225,7 @@ def __call__( padding: Optional[str] = "max_length", max_length: Optional[int] = None, sampling_rate: Optional[int] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). 
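The hunk in `convert_openai_to_hf.py` above drops a `"blocks": "layers"` entry from the rename mapping; the surrounding dict is not fully shown here, but the removal looks like a duplicated dictionary key, which duplicate the pyflakes-style checks in ruff commonly report. Duplicate keys in a dict literal are legal but silently collapse to a single entry, as this small illustration shows (the keys are copied from the visible hunk; the duplication itself is an assumption about the original file):

```python
# In a dict literal, a repeated key silently keeps only the last value,
# so a second "blocks": "layers" entry adds nothing at runtime.
rename_map = {
    "blocks": "layers",
    "mlp.0": "fc1",
    "mlp.2": "fc2",
    "blocks": "layers",  # duplicate: harmless at runtime, typically flagged by linters
}
print(len(rename_map))  # 3 -- the duplicate collapses into a single key
```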
diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index e8148366ac882c..dd6ebcb2c8ccaf 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -142,7 +142,7 @@ def __init__( dropout: float = 0.0, is_decoder: bool = False, bias: bool = True, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.embed_dim = embed_dim @@ -1363,7 +1363,7 @@ def prepare_inputs_for_generation( encoder_outputs=None, attention_mask=None, decoder_attention_mask=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 38bfe133a076b2..13e3ad7abbfb60 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -102,7 +102,6 @@ def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional super().__init__(num_positions, embedding_dim) def forward(self, input_ids, past_key_values_length=0): - return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[-1]] @@ -897,7 +896,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache =" @@ -923,7 +921,6 @@ def custom_forward(*inputs): None, # past_key_value ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1244,7 +1241,7 @@ def generate( task=None, language=None, is_multilingual=None, - **kwargs + **kwargs, ): """ @@ -1383,7 +1380,7 @@ def prepare_inputs_for_generation( use_cache=None, encoder_outputs=None, attention_mask=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index 4f687b33448d2d..7e8f675a15b0eb 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -18,7 +18,6 @@ from typing import List, Optional, Tuple, Union import numpy as np - import regex as re from ...tokenization_utils import AddedToken, PreTrainedTokenizer @@ -269,9 +268,8 @@ def __init__( language=None, task=None, predict_timestamps=False, - **kwargs + **kwargs, ): - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token @@ -562,7 +560,7 @@ def decode( output_offsets: bool = False, time_precision=0.02, decode_with_timestamps: bool = False, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 5a005e7a571954..ed2bdea69a50ac 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -100,7 +100,7 @@ def __init__( pad_token_id=1, 
bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -118,7 +118,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from XCLIPConfig @@ -219,7 +218,7 @@ def __init__( initializer_range=0.02, initializer_factor=1.0, drop_path_rate=0.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -244,7 +243,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from XCLIPConfig @@ -311,7 +309,7 @@ def __init__( prompt_attention_dropout=0.0, prompt_projection_dropout=0.0, logit_scale_init_value=2.6592, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 8210b3f709e39a..d4281fe849d61c 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -15,11 +15,11 @@ import argparse +import gdown import numpy as np import torch - -import gdown from huggingface_hub import hf_hub_download + from transformers import ( CLIPTokenizer, CLIPTokenizerFast, @@ -216,7 +216,6 @@ def prepare_video(num_frames): def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - model_to_url = { # fully supervised kinetics-400 checkpoints "xclip-base-patch32": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_32_8.pth", diff --git a/src/transformers/models/xglm/configuration_xglm.py b/src/transformers/models/xglm/configuration_xglm.py index c9ac1111a08d78..8a59ee6682d6ea 100644 --- a/src/transformers/models/xglm/configuration_xglm.py +++ b/src/transformers/models/xglm/configuration_xglm.py @@ -114,7 +114,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/xglm/modeling_flax_xglm.py b/src/transformers/models/xglm/modeling_flax_xglm.py index f6ae740624094f..5a2a8951f1a805 100644 --- a/src/transformers/models/xglm/modeling_flax_xglm.py +++ b/src/transformers/models/xglm/modeling_flax_xglm.py @@ -20,11 +20,10 @@ from functools import partial from typing import Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -562,7 +561,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -737,7 +736,6 @@ def __call__( return_dict: bool = True, deterministic: bool = True, ): - outputs = self.model( input_ids, attention_mask, diff --git 
a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index 2efcd4b1e32703..1dac55651563cc 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -809,7 +809,6 @@ def call( training: Optional[bool] = False, **kwargs: Any, ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]: - outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index ff02620d429c9d..6fb34917437c7d 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -739,7 +739,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache =" @@ -765,7 +764,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/xglm/tokenization_xglm.py b/src/transformers/models/xglm/tokenization_xglm.py index 4dc6c3d37f7cd4..f27c827134bf37 100644 --- a/src/transformers/models/xglm/tokenization_xglm.py +++ b/src/transformers/models/xglm/tokenization_xglm.py @@ -124,7 +124,7 @@ def __init__( unk_token="", pad_token="", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/xglm/tokenization_xglm_fast.py b/src/transformers/models/xglm/tokenization_xglm_fast.py index a0d4cebafeef38..834b8a47ecb920 100644 --- a/src/transformers/models/xglm/tokenization_xglm_fast.py +++ b/src/transformers/models/xglm/tokenization_xglm_fast.py @@ -110,7 +110,7 @@ def __init__( cls_token="", unk_token="", pad_token="", - **kwargs + **kwargs, ): # Compatibility with the original tokenizer self.num_madeup_words = 7 diff --git a/src/transformers/models/xlm/configuration_xlm.py b/src/transformers/models/xlm/configuration_xlm.py index 52a53d5bf0060e..cd8d721bfc37d2 100644 --- a/src/transformers/models/xlm/configuration_xlm.py +++ b/src/transformers/models/xlm/configuration_xlm.py @@ -192,7 +192,7 @@ def __init__( lang_id=0, pad_token_id=2, bos_token_id=0, - **kwargs + **kwargs, ): """Constructs XLMConfig.""" self.vocab_size = vocab_size diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index d054f535a42c4e..2d0adcddf95c21 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -102,7 +102,6 @@ def get_masks(slen, lengths, causal, padding_mask=None): class MultiHeadAttention(nn.Module): - NEW_ID = itertools.count() def __init__(self, n_heads, dim, config): diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py index 8bb021c5b96987..cbfb2b48ff0f1f 100644 --- a/src/transformers/models/xlm/tokenization_xlm.py +++ b/src/transformers/models/xlm/tokenization_xlm.py @@ -611,7 +611,7 @@ def __init__( lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True, - **kwargs + **kwargs, ): super().__init__( unk_token=unk_token, diff --git a/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py 
b/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py index cdca20ef3b43a8..29c8678f279981 100644 --- a/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py @@ -134,7 +134,7 @@ def __init__( pad_token_id: Optional[int] = 0, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.hidden_size = hidden_size diff --git a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py index c8b6fee9facd99..0327bb4f08e200 100644 --- a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py @@ -681,7 +681,6 @@ def forward( past_key_value: Optional[Tuple[Tensor]] = None, output_attentions: bool = False, ) -> Tuple[Tensor, Optional[Tensor]]: - batch_size, tgt_len, hidden_size = hidden_states.size() # if key_value_states are provided this layer is used as a cross-attention layer @@ -1612,7 +1611,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py index af8308287939f8..8468eb49d64d5a 100644 --- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py @@ -141,7 +141,7 @@ def __init__( cls_token="[CLS]", mask_token="[MASK]", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py index c8bad5905a489d..98e12d07826edc 100644 --- a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py @@ -132,7 +132,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py index 9ae7a9b8b8c9c1..ed5e113770e544 100644 --- a/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py @@ -17,11 +17,10 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -752,7 +751,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, 
input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) diff --git a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py index 5377bfdec19e30..b5fc694148e724 100644 --- a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py @@ -723,7 +723,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 1807cc67d524b7..30d75a29496667 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -502,7 +502,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index 40928d8dc30623..54a46842ff156d 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -145,7 +145,7 @@ def __init__( pad_token="", mask_token="", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py index f99e3c086a88c5..68f8df06ff3d24 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py @@ -147,7 +147,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py index a36fe66de66c5c..acf30bf3878a88 100644 --- a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py @@ -119,7 +119,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index a54d6835b92846..9250fa96398470 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -491,7 +491,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/xlnet/configuration_xlnet.py b/src/transformers/models/xlnet/configuration_xlnet.py index 3aa05f77c985c0..9ebc1f8bb9fb6f 100644 --- a/src/transformers/models/xlnet/configuration_xlnet.py +++ b/src/transformers/models/xlnet/configuration_xlnet.py @@ -176,7 +176,7 @@ def __init__( pad_token_id=5, bos_token_id=1, eos_token_id=2, - **kwargs + **kwargs, ): """Constructs XLNetConfig.""" self.vocab_size = vocab_size diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py index 42ec90efc4d46a..080dd91f2301cd 100644 --- a/src/transformers/models/xlnet/modeling_tf_xlnet.py +++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py @@ -597,7 +597,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ): - if training and use_mems is None: use_mems = self.use_mems_train else: diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index 07362fe29c53f7..d6ed227ff886aa 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -1080,7 +1080,6 @@ def forward( return_dict: Optional[bool] = None, **kwargs, # delete after depreciation warning is removed ) -> Union[Tuple, XLNetModelOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py index 9dc6fd24596419..5ad655c4173554 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet.py +++ b/src/transformers/models/xlnet/tokenization_xlnet.py @@ -145,7 +145,7 @@ def __init__( mask_token="", additional_special_tokens=["", ""], sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token @@ -256,7 +256,7 @@ def _decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, spaces_between_special_tokens: bool = True, - **kwargs + **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py index c27c5262f94c1f..3f6ff200a84ac3 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py +++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py @@ -142,7 +142,7 @@ def __init__( cls_token="<cls>", mask_token="<mask>", additional_special_tokens=["<eop>", "<eod>"], - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/yolos/configuration_yolos.py b/src/transformers/models/yolos/configuration_yolos.py index c6bfbff444dd4f..538f85b1eb29e8 100644 --- a/src/transformers/models/yolos/configuration_yolos.py +++ b/src/transformers/models/yolos/configuration_yolos.py @@ -129,7 +129,7 @@ def __init__( bbox_loss_coefficient=5, giou_loss_coefficient=2, eos_coefficient=0.1, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -160,7 +160,6 @@ def __init__( class YolosOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/yolos/convert_yolos_to_pytorch.py b/src/transformers/models/yolos/convert_yolos_to_pytorch.py index d953936e24ff01..02279518832bd1 100644 --- a/src/transformers/models/yolos/convert_yolos_to_pytorch.py +++ b/src/transformers/models/yolos/convert_yolos_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import YolosConfig, YolosFeatureExtractor, YolosForObjectDetection from transformers.utils import logging diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index afe6f55bd88a71..f49d5d14fd5edd 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -698,7 +698,7 @@ def __init__( image_mean: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -811,7 +811,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[ChannelDimension] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an @@ -955,7 +955,7 @@ def preprocess( format: Optional[Union[str, AnnotionFormat]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Preprocess an image or a batch of images so that it can be used by the model.
diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index 1921b87a9fcb9c..e3cb02ceae6ec0 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -322,7 +322,6 @@ def __init__(self, config: YolosConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -380,7 +379,6 @@ def __init__(self, config: YolosConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/yoso/configuration_yoso.py b/src/transformers/models/yoso/configuration_yoso.py index 7a2458146c6f59..c6d2b176ef947d 100644 --- a/src/transformers/models/yoso/configuration_yoso.py +++ b/src/transformers/models/yoso/configuration_yoso.py @@ -120,7 +120,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py b/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py index 2b9a2c7cd85338..be46a4de81b30c 100644 --- a/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py +++ b/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py @@ -75,7 +75,6 @@ def convert_checkpoint_helper(max_position_embeddings, orig_state_dict): def convert_yoso_checkpoint(checkpoint_path, yoso_config_file, pytorch_dump_path): - orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"] config = YosoConfig.from_json_file(yoso_config_file) model = YosoForMaskedLM(config) diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index df1ec304b952e2..cf0c814ff95080 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -98,7 +98,6 @@ def normalize(input_tensors): def hashing(query, key, num_hash, hash_len): - if len(query.size()) != 3: raise ValueError("Query has incorrect size.") if len(key.size()) != 3: diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py index e953207b3a595b..ee9c498e73ba28 100644 --- a/src/transformers/onnx/convert.py +++ b/src/transformers/onnx/convert.py @@ -246,9 +246,8 @@ def export_tensorflow( `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from the ONNX configuration. 
""" - import tensorflow as tf - import onnx + import tensorflow as tf import tf2onnx if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index 3c833875a87744..e3a2e59c37f801 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -219,7 +219,7 @@ def __init__( include_in_weight_decay: Optional[List[str]] = None, exclude_from_weight_decay: Optional[List[str]] = None, name: str = "AdamWeightDecay", - **kwargs + **kwargs, ): super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) self.weight_decay_rate = weight_decay_rate diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index f3645c6cb61af4..434009d7f293ea 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -24,9 +24,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from numpy import isin - from huggingface_hub import model_info +from numpy import isin from ..configuration_utils import PretrainedConfig from ..dynamic_module_utils import get_class_from_dynamic_module diff --git a/src/transformers/pipelines/audio_classification.py b/src/transformers/pipelines/audio_classification.py index a58247d4128790..7f104e74e12824 100644 --- a/src/transformers/pipelines/audio_classification.py +++ b/src/transformers/pipelines/audio_classification.py @@ -15,7 +15,6 @@ from typing import Union import numpy as np - import requests from ..utils import add_end_docstrings, is_torch_available, logging diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 8c552cbdc307a5..c8833ed3bd9303 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -15,7 +15,6 @@ from typing import TYPE_CHECKING, Dict, Optional, Union import numpy as np - import requests from ..utils import is_torch_available, logging @@ -302,7 +301,7 @@ def __init__( feature_extractor: Union["SequenceFeatureExtractor", str], *, decoder: Optional[Union["BeamSearchDecoderCTC", str]] = None, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.feature_extractor = feature_extractor diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index c4158f3cc4818c..db349944a4fda5 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -165,7 +165,6 @@ def _forward(self, model_inputs): def postprocess( self, model_outputs, subtask=None, threshold=0.9, mask_threshold=0.5, overlap_mask_area_threshold=0.5 ): - fn = None if subtask in {"panoptic", None} and hasattr(self.image_processor, "post_process_panoptic_segmentation"): fn = self.image_processor.post_process_panoptic_segmentation diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py index d4bb7f2102900d..597a0980e24e05 100644 --- a/src/transformers/pipelines/question_answering.py +++ b/src/transformers/pipelines/question_answering.py @@ -307,7 +307,7 @@ def _sanitize_parameters( max_question_len=None, handle_impossible_answer=None, align_to_words=None, - **kwargs + **kwargs, ): # Set defaults values preprocess_params = {} diff --git a/src/transformers/pipelines/table_question_answering.py 
b/src/transformers/pipelines/table_question_answering.py index 615037a6d96a91..c01d7e49053a91 100644 --- a/src/transformers/pipelines/table_question_answering.py +++ b/src/transformers/pipelines/table_question_answering.py @@ -23,7 +23,6 @@ if is_tf_available() and is_tensorflow_probability_available(): import tensorflow as tf - import tensorflow_probability as tfp from ..models.auto.modeling_tf_auto import ( diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py index a9f73218ad54fb..bb8e860ce9aeee 100644 --- a/src/transformers/pipelines/text2text_generation.py +++ b/src/transformers/pipelines/text2text_generation.py @@ -78,7 +78,7 @@ def _sanitize_parameters( clean_up_tokenization_spaces=None, truncation=None, stop_sequence=None, - **generate_kwargs + **generate_kwargs, ): preprocess_params = {} if truncation is not None: diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py index b19d58f4ffbb44..2800515c821a42 100644 --- a/src/transformers/pipelines/text_generation.py +++ b/src/transformers/pipelines/text_generation.py @@ -97,7 +97,7 @@ def _sanitize_parameters( prefix=None, handle_long_generation=None, stop_sequence=None, - **generate_kwargs + **generate_kwargs, ): preprocess_params = {} if prefix is not None: diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py index e126bb7b1ea11a..5dc243d1acf276 100644 --- a/src/transformers/pipelines/token_classification.py +++ b/src/transformers/pipelines/token_classification.py @@ -22,7 +22,6 @@ class TokenClassificationArgumentHandler(ArgumentHandler): """ def __call__(self, inputs: Union[str, List[str]], **kwargs): - if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0: inputs = list(inputs) batch_size = len(inputs) @@ -141,7 +140,6 @@ def _sanitize_parameters( aggregation_strategy: Optional[AggregationStrategy] = None, offset_mapping: Optional[List[Tuple[int, int]]] = None, ): - preprocess_params = {} if offset_mapping is not None: preprocess_params["offset_mapping"] = offset_mapping diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 785a54e7d7a3c3..41a1699a7e82cf 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -9,7 +9,6 @@ if is_decord_available(): import numpy as np - from decord import VideoReader @@ -85,7 +84,6 @@ def __call__(self, videos: Union[str, List[str]], **kwargs): return super().__call__(videos, **kwargs) def preprocess(self, video, num_frames=None, frame_sampling_rate=1): - if num_frames is None: num_frames = self.model.config.num_frames diff --git a/src/transformers/pipelines/zero_shot_object_detection.py b/src/transformers/pipelines/zero_shot_object_detection.py index cd4ff60c03d333..cf05999861c05f 100644 --- a/src/transformers/pipelines/zero_shot_object_detection.py +++ b/src/transformers/pipelines/zero_shot_object_detection.py @@ -66,7 +66,7 @@ def __call__( self, image: Union[str, "Image.Image", List[Dict[str, Any]]], candidate_labels: Union[str, List[str]] = None, - **kwargs + **kwargs, ): """ Detect objects (bounding boxes & classes) in the image(s) passed as inputs. 
@@ -168,7 +168,6 @@ def _forward(self, model_inputs): return model_outputs def postprocess(self, model_outputs, threshold=0.1, top_k=None): - results = [] for model_output in model_outputs: label = model_output["candidate_label"] diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 3a8ee66218bf60..821fd8c545f721 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -35,6 +35,7 @@ from unittest import mock import huggingface_hub + from transformers import logging as transformers_logging from .deepspeed import is_deepspeed_available @@ -778,6 +779,7 @@ def get_tests_dir(append_path=None): # The original code came from: # https://github.com/fastai/fastai/blob/master/tests/utils/text.py + # When any function contains print() calls that get overwritten, like progress bars, # a special care needs to be applied, since under pytest -s captured output (capsys # or contextlib.redirect_stdout) contains any temporary printed strings, followed by diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 6d33266c03f4e7..4dbbee4144c552 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -609,7 +609,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: def get_input_ids(text): if isinstance(text, str): @@ -693,7 +693,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: def get_input_ids(text): if isinstance(text, str): @@ -924,7 +924,7 @@ def _decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, spaces_between_special_tokens: bool = True, - **kwargs + **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 8f3c392d382227..77a8db79520dd1 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1824,7 +1824,7 @@ def _from_pretrained( cache_dir=None, local_files_only=False, _commit_hash=None, - **kwargs + **kwargs, ): # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json # file or if `from_slow` is set to True. @@ -1932,7 +1932,6 @@ def convert_added_tokens(obj: Union[AddedToken, Any]): model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] if model_max_length is not None and isinstance(model_max_length, (int, float)): - model_max_length = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) # TODO(PVP) - uncomment following line in Transformers v5 # init_kwargs["model_max_length"] = model_max_length @@ -2278,7 +2277,7 @@ def encode( max_length: Optional[int] = None, stride: int = 0, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> List[int]: """ Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. 
@@ -2474,7 +2473,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -2558,7 +2557,7 @@ def _call_one( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # Input type checking for clearer error def _is_valid_text_input(t): @@ -2671,7 +2670,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. @@ -2743,7 +2742,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: raise NotImplementedError @@ -2773,7 +2772,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a list of sequences or a list of pairs of sequences. @@ -2846,7 +2845,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: raise NotImplementedError @@ -3083,7 +3082,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It @@ -3271,8 +3270,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) @@ -3373,7 +3371,6 @@ def _pad( if self.padding_side == "right": if return_attention_mask: - encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = ( @@ -3415,7 +3412,7 @@ def batch_decode( sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, - **kwargs + **kwargs, ) -> List[str]: """ Convert a list of lists of token ids into a list of strings by calling decode. 
@@ -3448,7 +3445,7 @@ def decode( token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special @@ -3484,7 +3481,7 @@ def _decode( token_ids: Union[int, List[int]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, - **kwargs + **kwargs, ) -> str: raise NotImplementedError diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 4d13e34742370a..bcdbd8325bb351 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -411,7 +411,6 @@ def _batch_encode_plus( return_length: bool = False, verbose: bool = True, ) -> BatchEncoding: - if not isinstance(batch_text_or_text_pairs, (tuple, list)): raise TypeError( f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})" @@ -495,9 +494,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - batched_input = [(text, text_pair)] if text_pair else [text] batched_output = self._batch_encode_plus( batched_input, @@ -542,7 +540,7 @@ def _decode( token_ids: Union[int, List[int]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, - **kwargs + **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index da140111cc2d4f..f10cc2f4b490ba 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -37,7 +37,8 @@ # Integrations must be imported before ML frameworks: -from .integrations import ( # isort: split +# isort: off +from .integrations import ( default_hp_search_backend, get_reporting_integration_callbacks, hp_params, @@ -52,16 +53,17 @@ run_hp_search_wandb, ) +# isort: on + import numpy as np import torch import torch.distributed as dist +from huggingface_hub import Repository, create_repo from packaging import version from torch import nn from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler -from huggingface_hub import Repository, create_repo - from . import __version__ from .configuration_utils import PretrainedConfig from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator @@ -1414,9 +1416,8 @@ def _wrap_model(self, model, training=True, dataloader=None): # Distributed training using PyTorch FSDP elif self.fsdp is not None: # PyTorch FSDP! 
- from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload + from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP - from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy if FSDPOption.OFFLOAD in self.args.fsdp: @@ -2004,7 +2005,6 @@ def _get_output_dir(self, trial): return run_dir def _load_from_checkpoint(self, resume_from_checkpoint, model=None): - if model is None: model = self.model @@ -2071,7 +2071,6 @@ def _load_best_model(self): model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model if os.path.exists(best_model_path): if self.deepspeed: - if self.model_wrapped is not None: # this removes the pre-hooks from the previous engine self.model_wrapped.destroy() @@ -2127,7 +2126,6 @@ def _load_best_model(self): ) def _issue_warnings_after_load(self, load_result): - if len(load_result.missing_keys) != 0: if self.model._keys_to_ignore_on_save is not None and set(load_result.missing_keys) == set( self.model._keys_to_ignore_on_save @@ -2684,7 +2682,6 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa if self.args.should_save: self._save(output_dir, state_dict=state_dict) elif self.deepspeed: - # this takes care of everything as long as we aren't under zero3 if self.args.should_save: self._save(output_dir) @@ -2983,7 +2980,6 @@ def evaluation_loop( # if eval is called w/o train init deepspeed here if args.deepspeed and not self.deepspeed: - # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval # from the checkpoint eventually deepspeed_engine, _, _ = deepspeed_init( diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index cabc2ee51384d9..daab54a4e85831 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -595,6 +595,7 @@ class DistributedLengthGroupedSampler(DistributedSampler): Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while keeping a bit of randomness. """ + # Copied and adapted from PyTorch DistributedSampler. def __init__( self, diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py index 1ee85ce84c8ad6..4a79516d265c1b 100644 --- a/src/transformers/trainer_seq2seq.py +++ b/src/transformers/trainer_seq2seq.py @@ -33,7 +33,7 @@ def evaluate( eval_dataset: Optional[Dataset] = None, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval", - **gen_kwargs + **gen_kwargs, ) -> Dict[str, float]: """ Run evaluation and returns metrics. @@ -82,7 +82,7 @@ def predict( test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "test", - **gen_kwargs + **gen_kwargs, ) -> PredictionOutput: """ Run prediction and returns predictions and potential metrics. 
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py index 737dd4deaf6887..1f6435b787a02a 100644 --- a/src/transformers/trainer_tf.py +++ b/src/transformers/trainer_tf.py @@ -23,11 +23,14 @@ # Integrations must be imported before ML frameworks: -from .integrations import ( # isort: split +# isort: off +from .integrations import ( is_comet_available, is_wandb_available, ) +# isort: on + import numpy as np import tensorflow as tf from tensorflow.python.distribute.values import PerReplica @@ -462,7 +465,6 @@ def prediction_step( @tf.function def distributed_prediction_steps(self, batch): - nb_instances_in_batch = self._compute_nb_instances(batch) inputs = self._get_step_inputs(batch, nb_instances_in_batch) @@ -516,7 +518,6 @@ def train(self) -> None: epochs_trained = 0 steps_trained_in_current_epoch = 0 if self.model.ckpt_manager.latest_checkpoint: - logger.info( f"Checkpoint file {self.model.ckpt_manager.latest_checkpoint} found and restoring from checkpoint" ) @@ -560,7 +561,6 @@ def train(self) -> None: self._past = None for step, batch in enumerate(train_ds): - # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -704,7 +704,6 @@ def apply_gradients(self, features, labels, nb_instances_in_global_batch): @tf.function def distributed_training_steps(self, batch): with self.args.strategy.scope(): - nb_instances_in_batch = self._compute_nb_instances(batch) inputs = self._get_step_inputs(batch, nb_instances_in_batch) @@ -712,7 +711,6 @@ def distributed_training_steps(self, batch): @staticmethod def _compute_nb_instances(batch): - labels = batch[-1] if isinstance(labels, PerReplica): labels = tf.concat(labels.values, axis=0) @@ -723,7 +721,6 @@ def _compute_nb_instances(batch): @staticmethod def _get_step_inputs(batch, nb_instances): - features, labels = batch if isinstance(labels, PerReplica): diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index e857a260c7a991..484e8895125769 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -398,7 +398,6 @@ class TrainerMemoryTracker: } def __init__(self, skip_memory_metrics=False): - self.skip_memory_metrics = skip_memory_metrics if not is_psutil_available(): diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index d73ab249833ca0..39952073bc4664 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1165,7 +1165,6 @@ def __post_init__(self): self.half_precision_backend = self.fp16_backend if self.bf16 or self.bf16_full_eval: - if self.no_cuda and not is_torch_bf16_cpu_available() and not is_torch_tpu_available(): # cpu raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). 
You need torch>=1.10") diff --git a/src/transformers/utils/bitsandbytes.py b/src/transformers/utils/bitsandbytes.py index 4e14dbaf77d332..936079f45dcbb1 100644 --- a/src/transformers/utils/bitsandbytes.py +++ b/src/transformers/utils/bitsandbytes.py @@ -4,11 +4,10 @@ if is_bitsandbytes_available(): + import bitsandbytes as bnb import torch import torch.nn as nn - import bitsandbytes as bnb - if is_accelerate_available(): from accelerate import init_empty_weights from accelerate.utils import find_tied_parameters diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 7e951fdb192107..339ec1fc783111 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -64,7 +64,6 @@ def _generate_supported_model_class_names( model_name: Type[PretrainedConfig], supported_tasks: Optional[Union[str, List[str]]] = None, ) -> List[str]: - task_mapping = { "default": MODEL_MAPPING_NAMES, "pretraining": MODEL_FOR_PRETRAINING_MAPPING_NAMES, @@ -692,7 +691,6 @@ class HFTracer(Tracer): ] def __init__(self, autowrap_modules=(math,), autowrap_functions=()): - super().__init__(autowrap_modules=autowrap_modules, autowrap_functions=autowrap_functions) if not is_torch_fx_available(): @@ -713,7 +711,6 @@ def _generate_dummy_input( inputs_dict = {} if input_name in ["labels", "start_positions", "end_positions"]: - batch_size = shape[0] if model_class_name in [ *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES), @@ -1147,7 +1144,6 @@ def symbolic_trace( input_names: Optional[List[str]] = None, disable_check: bool = False, ) -> GraphModule: - """ Performs symbolic tracing on the model. diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 4f8ea58d9dcaf2..bb3575edf24f98 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -48,6 +48,7 @@ hf_raise_for_status, ) from requests.exceptions import HTTPError + from transformers.utils.logging import tqdm from . 
import __version__, logging @@ -709,7 +710,7 @@ def push_to_hub( use_auth_token: Optional[Union[bool, str]] = None, max_shard_size: Optional[Union[int, str]] = "10GB", create_pr: bool = False, - **deprecated_kwargs + **deprecated_kwargs, ) -> str: """ Upload the {object_files} to the 🤗 Model Hub while synchronizing a local clone of the repo in diff --git a/src/transformers/utils/logging.py b/src/transformers/utils/logging.py index a98e2f30fd6eb8..a1cce40dc2f121 100644 --- a/src/transformers/utils/logging.py +++ b/src/transformers/utils/logging.py @@ -18,19 +18,20 @@ import os import sys import threading -from logging import CRITICAL # NOQA -from logging import DEBUG # NOQA -from logging import ERROR # NOQA -from logging import FATAL # NOQA -from logging import INFO # NOQA -from logging import NOTSET # NOQA -from logging import WARN # NOQA -from logging import WARNING # NOQA +from logging import ( + CRITICAL, # NOQA + DEBUG, # NOQA + ERROR, # NOQA + FATAL, # NOQA + INFO, # NOQA + NOTSET, # NOQA + WARN, # NOQA + WARNING, # NOQA +) from typing import Optional -from tqdm import auto as tqdm_lib - import huggingface_hub.utils as hf_hub_utils +from tqdm import auto as tqdm_lib _lock = threading.Lock() @@ -67,17 +68,14 @@ def _get_default_logging_level(): def _get_library_name() -> str: - return __name__.split(".")[0] def _get_library_root_logger() -> logging.Logger: - return logging.getLogger(_get_library_name()) def _configure_library_root_logger() -> None: - global _default_handler with _lock: @@ -95,7 +93,6 @@ def _configure_library_root_logger() -> None: def _reset_library_root_logger() -> None: - global _default_handler with _lock: diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index bcbe808013596f..b5d23417cec190 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -33,14 +33,12 @@ def assert_device_map(device_map, num_blocks): if len(duplicate_blocks) != 0: raise ValueError( "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device." - " These attention blocks were specified more than once: " - + str(duplicate_blocks) + " These attention blocks were specified more than once: " + str(duplicate_blocks) ) if len(missing_blocks) != 0: raise ValueError( "There are attention blocks for this model that are not specified in the device_map. Add these attention " - "blocks to a device on the device_map: " - + str(missing_blocks) + "blocks to a device on the device_map: " + str(missing_blocks) ) if len(extra_blocks) != 0: raise ValueError( diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 65644ef3ac6ddd..eea27ba7044f15 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -21,8 +21,8 @@ from copy import deepcopy import datasets - from parameterized import parameterized + from tests.trainer.test_trainer import TrainerIntegrationCommon # noqa from transformers import AutoModel, TrainingArguments, is_torch_available, logging from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available, unset_hf_deepspeed_config @@ -271,7 +271,6 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T # --- These tests are enough to run on one of zero stages --- # def test_hf_ds_config_mismatch(self): - ds_config = self.get_config_dict(ZERO2) # Purposefully configure these values to mismatch TrainingArguments values. 
@@ -383,7 +382,6 @@ def test_stage3_nvme_offload(self): @require_optuna def test_hyperparameter_search(self): with mockenv_context(**self.dist_env_1_gpu): - ds_config_zero3_dict = self.get_config_dict(ZERO3) # hyperparameter_search requires model_init() to recreate the model for each trial @@ -599,7 +597,6 @@ def test_save_checkpoints(self, stage, dtype): @parameterized.expand(params, name_func=parameterized_custom_name_func) def test_can_resume_training_errors(self, stage, dtype): - with mockenv_context(**self.dist_env_1_gpu): ds_config_dict = self.get_config_dict(stage) output_dir = self.get_auto_remove_tmp_dir() @@ -765,7 +762,6 @@ def test_load_best_model(self, stage, dtype): ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True with mockenv_context(**self.dist_env_1_gpu): - args_dict = { "per_gpu_train_batch_size": 1, "per_gpu_eval_batch_size": 1, @@ -938,7 +934,6 @@ def test_inference(self, dtype): ) def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True): - if do_train: train_metrics = load_json(os.path.join(output_dir, "train_results.json")) self.assertIn("train_samples_per_second", train_metrics) @@ -966,7 +961,6 @@ def run_and_check( extra_args_str: str = None, remove_args_str: str = None, ): - # we are doing quality testing so using a small real model output_dir = self.run_trainer( stage=stage, diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py index 6c1af5177671ed..984c7e756578b3 100644 --- a/tests/deepspeed/test_model_zoo.py +++ b/tests/deepspeed/test_model_zoo.py @@ -18,6 +18,7 @@ from os.path import dirname from parameterized import parameterized + from tests.trainer.test_trainer import TrainerIntegrationCommon # noqa from transformers import is_torch_available from transformers.testing_utils import ( diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index d0fc5582160521..d86fb337af04c5 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -22,6 +22,7 @@ from unittest.mock import patch from parameterized import parameterized + from transformers.testing_utils import ( CaptureStderr, ExtendSysPath, @@ -361,7 +362,6 @@ def run_trainer( args += extra_args_str.split() if distributed: - if n_gpus_to_use is None: n_gpus_to_use = get_gpu_count() master_port = get_torch_dist_unique_port() diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py index 2c655254f2e5ad..8add735a0bf07b 100644 --- a/tests/generation/test_configuration_utils.py +++ b/tests/generation/test_configuration_utils.py @@ -20,6 +20,7 @@ from huggingface_hub import HfFolder, delete_repo, set_access_token from parameterized import parameterized from requests.exceptions import HTTPError + from transformers import AutoConfig, GenerationConfig from transformers.testing_utils import TOKEN, USER, is_staging_test diff --git a/tests/generation/test_flax_logits_process.py b/tests/generation/test_flax_logits_process.py index 27dea2b029dd8f..a45d75ae244bb6 100644 --- a/tests/generation/test_flax_logits_process.py +++ b/tests/generation/test_flax_logits_process.py @@ -27,6 +27,7 @@ if is_flax_available(): import jax import jax.numpy as jnp + from transformers.generation import ( FlaxForcedBOSTokenLogitsProcessor, FlaxForcedEOSTokenLogitsProcessor, diff --git a/tests/generation/test_flax_utils.py b/tests/generation/test_flax_utils.py index aabab559853bb0..c6182a2386e561 100644 --- a/tests/generation/test_flax_utils.py 
+++ b/tests/generation/test_flax_utils.py @@ -27,6 +27,7 @@ import jax.numpy as jnp from jax import jit + from transformers import AutoTokenizer, FlaxAutoModelForCausalLM from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model diff --git a/tests/generation/test_framework_agnostic.py b/tests/generation/test_framework_agnostic.py index 014ed4af1e6dc1..ad1263cb938a88 100644 --- a/tests/generation/test_framework_agnostic.py +++ b/tests/generation/test_framework_agnostic.py @@ -8,7 +8,6 @@ class GenerationIntegrationTestsMixin: - # To be populated by the child classes framework_dependent_parameters = { "AutoModelForSeq2SeqLM": None, diff --git a/tests/generation/test_tf_logits_process.py b/tests/generation/test_tf_logits_process.py index 195188f10bfc64..a1f665c9a761fc 100644 --- a/tests/generation/test_tf_logits_process.py +++ b/tests/generation/test_tf_logits_process.py @@ -17,8 +17,8 @@ import unittest import numpy as np - from parameterized import parameterized + from transformers import is_tf_available from transformers.testing_utils import require_tf diff --git a/tests/generation/test_tf_utils.py b/tests/generation/test_tf_utils.py index 7d500571896373..afd4cfef4f7fe3 100644 --- a/tests/generation/test_tf_utils.py +++ b/tests/generation/test_tf_utils.py @@ -36,7 +36,6 @@ @require_tf class UtilsFunctionsTest(unittest.TestCase): - # tests whether the top_k_top_p_filtering function behaves as expected def test_top_k_top_p_filtering(self): logits = tf.convert_to_tensor( @@ -133,7 +132,6 @@ def test_top_k_top_p_filtering(self): @require_tf class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin): - # setting framework_dependent_parameters needs to be gated, just like its contents' imports if is_tf_available(): framework_dependent_parameters = { diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 5bbefef8f1ffc6..d65a34947b0a9d 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1413,7 +1413,6 @@ def test_constrained_beam_search_generate_dict_output(self): def test_contrastive_generate(self): # check `generate()` and `contrastive_search()` are equal for model_class in self.all_generative_model_classes: - # won't fix: FSMT and Reformer have a different cache variable type (and format). if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): return @@ -1435,7 +1434,6 @@ def test_contrastive_generate(self): def test_contrastive_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: - # won't fix: FSMT and Reformer have a different cache variable type (and format). 
if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): return @@ -1661,7 +1659,6 @@ def _check_sequence_inside_sequence(self, tensor_1, tensor_2): @require_torch class UtilsFunctionsTest(unittest.TestCase): - # tests whether the top_k_top_p function behaves as expected def test_top_k_top_p_filtering(self): logits = torch.tensor( @@ -1792,7 +1789,6 @@ def test_top_k_top_p_filtering_with_filter_value(self): @require_torch class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin): - # setting framework_dependent_parameters needs to be gated, just like its contents' imports if is_torch_available(): framework_dependent_parameters = { diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py index 9acb5ba99791ac..9cd62918db4e78 100644 --- a/tests/models/albert/test_modeling_albert.py +++ b/tests/models/albert/test_modeling_albert.py @@ -240,7 +240,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class AlbertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( AlbertModel, diff --git a/tests/models/albert/test_modeling_flax_albert.py b/tests/models/albert/test_modeling_flax_albert.py index 802952e52cbd85..5292665f55b794 100644 --- a/tests/models/albert/test_modeling_flax_albert.py +++ b/tests/models/albert/test_modeling_flax_albert.py @@ -24,6 +24,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.models.albert.modeling_flax_albert import ( FlaxAlbertForMaskedLM, FlaxAlbertForMultipleChoice, @@ -117,7 +118,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxAlbertModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( FlaxAlbertModel, diff --git a/tests/models/albert/test_modeling_tf_albert.py b/tests/models/albert/test_modeling_tf_albert.py index ad10228a518225..873116c0d304fd 100644 --- a/tests/models/albert/test_modeling_tf_albert.py +++ b/tests/models/albert/test_modeling_tf_albert.py @@ -228,7 +228,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFAlbertModel, diff --git a/tests/models/albert/test_tokenization_albert.py b/tests/models/albert/test_tokenization_albert.py index 5459917775d992..c25cfaec77b4aa 100644 --- a/tests/models/albert/test_tokenization_albert.py +++ b/tests/models/albert/test_tokenization_albert.py @@ -27,7 +27,6 @@ @require_sentencepiece @require_tokenizers class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = AlbertTokenizer rust_tokenizer_class = AlbertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index c22b9a468409d6..ff39329d8404d6 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + from transformers import AltCLIPConfig, AltCLIPProcessor, AltCLIPTextConfig, AltCLIPVisionConfig from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -292,7 +292,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class AltCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (AltCLIPTextModel,) if is_torch_available() else () fx_compatible = True 
test_pruning = False @@ -343,7 +342,6 @@ def test_model_from_pretrained(self): class AltCLIPModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -395,7 +393,6 @@ def prepare_img(): @require_torch class AltCLIPModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (AltCLIPModel,) if is_torch_available() else () fx_compatible = True test_head_masking = False diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index cf6bb1d27f7920..6fd035af8d0471 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -102,7 +102,6 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = ASTFeatureExtractor def setUp(self): diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index 90d748ebea4a85..b0d3140de52d35 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -18,6 +18,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import ASTConfig from transformers.testing_utils import require_torch, require_torchaudio, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_torchaudio_available @@ -225,7 +226,6 @@ def default_feature_extractor(self): @slow def test_inference_audio_classification(self): - feature_extractor = self.default_feature_extractor model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(torch_device) diff --git a/tests/models/auto/test_modeling_auto.py b/tests/models/auto/test_modeling_auto.py index 0008aa101b45a2..9fb982c0f0e14e 100644 --- a/tests/models/auto/test_modeling_auto.py +++ b/tests/models/auto/test_modeling_auto.py @@ -42,8 +42,8 @@ if is_torch_available(): import torch - from test_module.custom_modeling import CustomModel + from transformers import ( AutoConfig, AutoModel, @@ -394,7 +394,6 @@ def test_cached_model_has_minimum_calls_to_head(self): self.assertEqual(counter.other_request_count, 0) def test_attr_not_existing(self): - from transformers.models.auto.auto_factory import _LazyAutoMapping _CONFIG_MAPPING_NAMES = OrderedDict([("bert", "BertConfig")]) diff --git a/tests/models/auto/test_modeling_flax_auto.py b/tests/models/auto/test_modeling_flax_auto.py index 26f80f97064788..5880551f54dac8 100644 --- a/tests/models/auto/test_modeling_flax_auto.py +++ b/tests/models/auto/test_modeling_flax_auto.py @@ -20,6 +20,7 @@ if is_flax_available(): import jax + from transformers.models.auto.modeling_flax_auto import FlaxAutoModel from transformers.models.bert.modeling_flax_bert import FlaxBertModel from transformers.models.roberta.modeling_flax_roberta import FlaxRobertaModel diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py index 107e3611db3fbb..a880bc0a084a5d 100644 --- 
a/tests/models/auto/test_processor_auto.py +++ b/tests/models/auto/test_processor_auto.py @@ -23,6 +23,7 @@ from huggingface_hub import HfFolder, Repository, create_repo, delete_repo, set_access_token from requests.exceptions import HTTPError + from transformers import ( CONFIG_MAPPING, FEATURE_EXTRACTOR_MAPPING, diff --git a/tests/models/bart/test_modeling_flax_bart.py b/tests/models/bart/test_modeling_flax_bart.py index 7a1d1c5e8b4607..f97f49149817f0 100644 --- a/tests/models/bart/test_modeling_flax_bart.py +++ b/tests/models/bart/test_modeling_flax_bart.py @@ -33,6 +33,7 @@ import jax import jax.numpy as jnp + from transformers.models.bart.modeling_flax_bart import ( FlaxBartForConditionalGeneration, FlaxBartForQuestionAnswering, diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py index 24ea6e1e5cd95b..5607d1d3d2e113 100644 --- a/tests/models/bart/test_tokenization_bart.py +++ b/tests/models/bart/test_tokenization_bart.py @@ -132,7 +132,6 @@ def test_prepare_batch_not_longer_than_maxlen(self): @require_torch def test_special_tokens(self): - src_text = ["A long paragraph for summarization."] tgt_text = [ "Summary of the text.", diff --git a/tests/models/barthez/test_tokenization_barthez.py b/tests/models/barthez/test_tokenization_barthez.py index 38acf046b4f3e9..fa128f5091b9f4 100644 --- a/tests/models/barthez/test_tokenization_barthez.py +++ b/tests/models/barthez/test_tokenization_barthez.py @@ -25,7 +25,6 @@ @require_sentencepiece @slow # see https://github.com/huggingface/transformers/issues/11457 class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BarthezTokenizer rust_tokenizer_class = BarthezTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/bartpho/test_tokenization_bartpho.py b/tests/models/bartpho/test_tokenization_bartpho.py index fc5ebfd19c4a14..1fc06e38e04507 100644 --- a/tests/models/bartpho/test_tokenization_bartpho.py +++ b/tests/models/bartpho/test_tokenization_bartpho.py @@ -26,7 +26,6 @@ class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BartphoTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/beit/test_image_processing_beit.py b/tests/models/beit/test_image_processing_beit.py index b9b809108f3b20..dca26e049298ff 100644 --- a/tests/models/beit/test_image_processing_beit.py +++ b/tests/models/beit/test_image_processing_beit.py @@ -105,7 +105,6 @@ def prepare_semantic_batch_inputs(): @require_torch @require_vision class BeitImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = BeitImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/beit/test_modeling_flax_beit.py b/tests/models/beit/test_modeling_flax_beit.py index 94ffda61eb97f2..75c58aec471731 100644 --- a/tests/models/beit/test_modeling_flax_beit.py +++ b/tests/models/beit/test_modeling_flax_beit.py @@ -27,6 +27,7 @@ if is_flax_available(): import jax + from transformers import FlaxBeitForImageClassification, FlaxBeitForMaskedImageModeling, FlaxBeitModel if is_vision_available(): @@ -140,7 +141,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxBeitModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = ( (FlaxBeitModel, FlaxBeitForImageClassification, FlaxBeitForMaskedImageModeling) if is_flax_available() else () ) diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py 
index 367e5ee53c4026..8873ccb613665b 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -427,7 +427,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( BertModel, @@ -565,7 +564,6 @@ def test_model_from_pretrained(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # BertForMultipleChoice behaves incorrectly in JIT environments. if model_class == BertForMultipleChoice: return diff --git a/tests/models/bert/test_modeling_flax_bert.py b/tests/models/bert/test_modeling_flax_bert.py index 5516c4d6fe67fe..55ffb440196f17 100644 --- a/tests/models/bert/test_modeling_flax_bert.py +++ b/tests/models/bert/test_modeling_flax_bert.py @@ -133,7 +133,6 @@ def prepare_config_and_inputs_for_decoder(self): @require_flax class FlaxBertModelTest(FlaxModelTesterMixin, unittest.TestCase): - test_head_masking = True all_model_classes = ( diff --git a/tests/models/bert/test_modeling_tf_bert.py b/tests/models/bert/test_modeling_tf_bert.py index 451f54325d13e0..86c42c3233be95 100644 --- a/tests/models/bert/test_modeling_tf_bert.py +++ b/tests/models/bert/test_modeling_tf_bert.py @@ -591,7 +591,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFBertModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFBertModel, diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py index dfbcd266c49917..c5ebd6dbf34a29 100644 --- a/tests/models/bert/test_tokenization_bert.py +++ b/tests/models/bert/test_tokenization_bert.py @@ -34,7 +34,6 @@ @require_tokenizers class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertTokenizer rust_tokenizer_class = BertTokenizerFast test_rust_tokenizer = True @@ -305,7 +304,6 @@ def test_change_tokenize_chinese_chars(self): text_with_chinese_char = "".join(list_of_commun_chinese_char) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - kwargs["tokenize_chinese_chars"] = True tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) diff --git a/tests/models/bert_generation/test_modeling_bert_generation.py b/tests/models/bert_generation/test_modeling_bert_generation.py index ebd8af2bb6f319..b2a6b5060bfd34 100644 --- a/tests/models/bert_generation/test_modeling_bert_generation.py +++ b/tests/models/bert_generation/test_modeling_bert_generation.py @@ -241,7 +241,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BertGenerationEncoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BertGenerationEncoder, BertGenerationDecoder) if is_torch_available() else () all_generative_model_classes = (BertGenerationDecoder,) if is_torch_available() else () diff --git a/tests/models/bert_generation/test_tokenization_bert_generation.py b/tests/models/bert_generation/test_tokenization_bert_generation.py index 581f249db050cb..12be95d53ebd78 100644 --- a/tests/models/bert_generation/test_tokenization_bert_generation.py +++ b/tests/models/bert_generation/test_tokenization_bert_generation.py @@ -29,7 +29,6 @@ 
@require_sentencepiece class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertGenerationTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py index 038a334cebc79d..3e840018bdc15b 100644 --- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py +++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py @@ -36,7 +36,6 @@ @custom_tokenizers class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertJapaneseTokenizer test_rust_tokenizer = False space_between_special_tokens = True @@ -369,7 +368,6 @@ def test_sequence_builders(self): @custom_tokenizers class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertJapaneseTokenizer test_rust_tokenizer = False diff --git a/tests/models/bertweet/test_tokenization_bertweet.py b/tests/models/bertweet/test_tokenization_bertweet.py index 5f82fba516754b..2a4c643269c6da 100644 --- a/tests/models/bertweet/test_tokenization_bertweet.py +++ b/tests/models/bertweet/test_tokenization_bertweet.py @@ -22,7 +22,6 @@ class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertweetTokenizer test_rust_tokenizer = False diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index ec8705607d6578..8859db16ecc913 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -430,7 +430,6 @@ def create_and_check_for_change_to_full_attn( @require_torch class BigBirdModelTest(ModelTesterMixin, unittest.TestCase): - # head masking & pruning is currently not supported for big bird test_head_masking = False test_pruning = False diff --git a/tests/models/big_bird/test_modeling_flax_big_bird.py b/tests/models/big_bird/test_modeling_flax_big_bird.py index 7c4c7267216a62..997df338963927 100644 --- a/tests/models/big_bird/test_modeling_flax_big_bird.py +++ b/tests/models/big_bird/test_modeling_flax_big_bird.py @@ -24,6 +24,7 @@ if is_flax_available(): import jax + from transformers.models.big_bird.modeling_flax_big_bird import ( FlaxBigBirdForCausalLM, FlaxBigBirdForMaskedLM, @@ -134,7 +135,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxBigBirdModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( FlaxBigBirdForCausalLM, @@ -210,7 +210,6 @@ def model_jitted(input_ids, attention_mask=None, **kwargs): self.assertEqual(len(outputs), len(jitted_outputs)) for jitted_output, output in zip(jitted_outputs, outputs): - self.assertEqual(jitted_output.shape, output.shape) # overwrite from common in order to skip the check on `attentions` diff --git a/tests/models/big_bird/test_tokenization_big_bird.py b/tests/models/big_bird/test_tokenization_big_bird.py index ff654510082574..fd4323cb0f57c6 100644 --- a/tests/models/big_bird/test_tokenization_big_bird.py +++ b/tests/models/big_bird/test_tokenization_big_bird.py @@ -30,7 +30,6 @@ @require_sentencepiece @require_tokenizers class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BigBirdTokenizer rust_tokenizer_class = BigBirdTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py index b0eb86d43fc613..14d4900a5096ee 100644 
--- a/tests/models/biogpt/test_modeling_biogpt.py +++ b/tests/models/biogpt/test_modeling_biogpt.py @@ -263,7 +263,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BioGptModel, BioGptForCausalLM) if is_torch_available() else () all_generative_model_classes = (BioGptForCausalLM,) if is_torch_available() else () test_pruning = False diff --git a/tests/models/blenderbot/test_modeling_flax_blenderbot.py b/tests/models/blenderbot/test_modeling_flax_blenderbot.py index 70dd9c24e95cbf..771a388d4a19a0 100644 --- a/tests/models/blenderbot/test_modeling_flax_blenderbot.py +++ b/tests/models/blenderbot/test_modeling_flax_blenderbot.py @@ -34,6 +34,7 @@ import jax import jax.numpy as jnp + from transformers import BlenderbotTokenizer from transformers.models.blenderbot.modeling_flax_blenderbot import ( FlaxBlenderbotForConditionalGeneration, diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py index c0d58c0d1483ff..5252b9cb986ee7 100644 --- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py @@ -297,7 +297,6 @@ def tokenizer(self): @slow def test_90_generation_from_long_input(self): - src_text = [ "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel" " like i'm going to throw up.\nand why is that?" diff --git a/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py index 695eb3b30dad12..d417ac3073d50b 100644 --- a/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py @@ -34,6 +34,7 @@ import jax import jax.numpy as jnp + from transformers.models.blenderbot_small.modeling_flax_blenderbot_small import ( FlaxBlenderbotSmallForConditionalGeneration, FlaxBlenderbotSmallModel, diff --git a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py index 7ea7f09b5764bf..b022e77682bdfb 100644 --- a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py @@ -27,7 +27,6 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BlenderbotSmallTokenizer test_rust_tokenizer = False diff --git a/tests/models/blip/test_image_processing_blip.py b/tests/models/blip/test_image_processing_blip.py index f3f2ab21dfd4a8..245c722ed4031c 100644 --- a/tests/models/blip/test_image_processing_blip.py +++ b/tests/models/blip/test_image_processing_blip.py @@ -110,7 +110,6 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torch @require_vision class BlipImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = BlipImageProcessor if is_vision_available() else None def setUp(self): @@ -232,7 +231,6 @@ def test_call_pytorch(self): @require_torch @require_vision class BlipImageProcessingTestFourChannels(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = BlipImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/blip/test_modeling_blip.py 
b/tests/models/blip/test_modeling_blip.py index 800bd67989600d..c1c208d36bd7d6 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + from transformers import BlipConfig, BlipTextConfig, BlipVisionConfig from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -301,7 +301,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BlipTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BlipTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -345,7 +344,6 @@ def test_model_from_pretrained(self): class BlipModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -523,7 +521,6 @@ def test_model_from_pretrained(self): class BlipTextRetrievalModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -571,7 +568,6 @@ def prepare_config_and_inputs_for_common(self): class BlipTextImageModelsModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py index 2e5e37ce2e96b3..6c7df61c5ffb80 100644 --- a/tests/models/blip/test_modeling_blip_text.py +++ b/tests/models/blip/test_modeling_blip_text.py @@ -125,7 +125,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BlipTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BlipTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py index ee703b035ac60c..85e6312e72117b 100644 --- a/tests/models/bloom/test_modeling_bloom.py +++ b/tests/models/bloom/test_modeling_bloom.py @@ -320,7 +320,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BloomModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( BloomModel, @@ -450,7 +449,6 @@ def test_batch_generation(self): @slow @require_torch_gpu def test_batch_generation_padd(self): - path_560m = "bigscience/bloom-560m" model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").cuda() model = model.eval() diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py index 117240dbdae19c..88ead384e0edcb 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -25,7 +25,6 @@ @require_tokenizers class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - slow_tokenizer_class = None rust_tokenizer_class = BloomTokenizerFast tokenizer_class = BloomTokenizerFast diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py index 85057c5278bbbd..70dba0a781c048 100644 --- a/tests/models/byt5/test_tokenization_byt5.py +++ b/tests/models/byt5/test_tokenization_byt5.py @@ -36,7 +36,6 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = ByT5Tokenizer 
test_rust_tokenizer = False diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py index aff186d73cb065..6acabc7bf25dd7 100644 --- a/tests/models/camembert/test_tokenization_camembert.py +++ b/tests/models/camembert/test_tokenization_camembert.py @@ -31,7 +31,6 @@ @require_sentencepiece @require_tokenizers class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = CamembertTokenizer rust_tokenizer_class = CamembertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py index cf45f10a833af8..92f27047a979c8 100644 --- a/tests/models/canine/test_modeling_canine.py +++ b/tests/models/canine/test_modeling_canine.py @@ -208,7 +208,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CanineModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( CanineModel, diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py index 6ae27082ccebfa..a52ef3d784c80c 100644 --- a/tests/models/canine/test_tokenization_canine.py +++ b/tests/models/canine/test_tokenization_canine.py @@ -28,7 +28,6 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = CanineTokenizer test_rust_tokenizer = False @@ -188,7 +187,6 @@ def test_added_token_serializable(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # a special token for Canine can be defined as follows: NEW_TOKEN = 0xE006 new_token = chr(NEW_TOKEN) @@ -262,7 +260,6 @@ def test_encode_decode_with_spaces(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - input = "hello world" if self.space_between_special_tokens: output = "[CLS] hello world [SEP]" diff --git a/tests/models/chinese_clip/test_image_processing_chinese_clip.py b/tests/models/chinese_clip/test_image_processing_chinese_clip.py index b7b31350713a9f..90b0ffbcd6c593 100644 --- a/tests/models/chinese_clip/test_image_processing_chinese_clip.py +++ b/tests/models/chinese_clip/test_image_processing_chinese_clip.py @@ -114,7 +114,6 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torch @require_vision class ChineseCLIPImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None def setUp(self): @@ -247,7 +246,6 @@ def test_call_pytorch(self): @require_torch @require_vision class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 97e522b3b95e6f..637984264e3780 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -20,8 +20,8 @@ import unittest import numpy as np - import requests + from transformers import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_vision, slow, torch_device @@ -316,7 
+316,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ChineseCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (ChineseCLIPTextModel,) if is_torch_available() else () fx_compatible = False @@ -478,7 +477,6 @@ def test_model_from_pretrained(self): class ChineseCLIPModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: diff --git a/tests/models/clip/test_image_processing_clip.py b/tests/models/clip/test_image_processing_clip.py index 7ffaceb54c68da..00a43a6bb43766 100644 --- a/tests/models/clip/test_image_processing_clip.py +++ b/tests/models/clip/test_image_processing_clip.py @@ -114,7 +114,6 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torch @require_vision class CLIPImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = CLIPImageProcessor if is_vision_available() else None def setUp(self): @@ -247,7 +246,6 @@ def test_call_pytorch(self): @require_torch @require_vision class CLIPImageProcessingTestFourChannels(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = CLIPImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 8fa3ce21bb31e3..2e80df6c27bfd0 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + import transformers from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig from transformers.testing_utils import ( @@ -67,6 +67,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, @@ -347,7 +348,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPTextModel, CLIPTextModelWithProjection) if is_torch_available() else () fx_compatible = True test_pruning = False @@ -402,7 +402,6 @@ def test_model_with_projection_from_pretrained(self): class CLIPModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -579,7 +578,6 @@ def test_equivalence_pt_to_flax(self): for model_class in self.all_model_classes: with self.subTest(model_class.__name__): - # load PyTorch class pt_model = model_class(config).eval() # Flax models don't use the `use_cache` option and cache is not returned as a default. 
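The import reshuffling visible in the CLIP and ChineseCLIP test hunks above follows the grouping that the new ruff-based check enforces: standard library first, then third-party packages, then `transformers` as the first-party package, with one blank line between groups. The snippet below is a minimal, invented sketch of that layout; the test class, its assertions, and the checked values are illustrative only and are not part of this diff.

```python
import unittest

import numpy as np

from transformers import BertConfig
from transformers.testing_utils import require_torch


@require_torch
class ImportOrderExampleTest(unittest.TestCase):
    def test_toy_inputs(self):
        # Invented checks, only here so that every import group above is used.
        config = BertConfig()
        input_ids = np.zeros((2, 8), dtype=np.int64)
        self.assertEqual(input_ids.shape, (2, 8))
        self.assertEqual(config.hidden_size, 768)
```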
diff --git a/tests/models/clip/test_modeling_flax_clip.py b/tests/models/clip/test_modeling_flax_clip.py index b8a1030ad1b0c8..7d63fa9edf1774 100644 --- a/tests/models/clip/test_modeling_flax_clip.py +++ b/tests/models/clip/test_modeling_flax_clip.py @@ -14,6 +14,7 @@ if is_flax_available(): import jax import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, diff --git a/tests/models/clip/test_modeling_tf_clip.py b/tests/models/clip/test_modeling_tf_clip.py index 05b4c7920ebd81..88ad5be374d0e8 100644 --- a/tests/models/clip/test_modeling_tf_clip.py +++ b/tests/models/clip/test_modeling_tf_clip.py @@ -22,6 +22,7 @@ from importlib import import_module import requests + from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig from transformers.testing_utils import require_tf, require_vision, slow from transformers.utils import is_tf_available, is_vision_available @@ -396,7 +397,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFCLIPTextModel,) if is_tf_available() else () test_pruning = False test_head_masking = False diff --git a/tests/models/clip/test_tokenization_clip.py b/tests/models/clip/test_tokenization_clip.py index e9ba304b475dd6..fc958267105c32 100644 --- a/tests/models/clip/test_tokenization_clip.py +++ b/tests/models/clip/test_tokenization_clip.py @@ -27,7 +27,6 @@ @require_tokenizers class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = CLIPTokenizer rust_tokenizer_class = CLIPTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index f170f606533850..24a59e48dc3851 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + import transformers from transformers import MODEL_MAPPING, CLIPSegConfig, CLIPSegProcessor, CLIPSegTextConfig, CLIPSegVisionConfig from transformers.models.auto import get_values @@ -60,6 +60,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, @@ -302,7 +303,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPSegTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -346,7 +346,6 @@ def test_model_from_pretrained(self): class CLIPSegModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -560,7 +559,6 @@ def test_equivalence_pt_to_flax(self): for model_class in self.all_model_classes: with self.subTest(model_class.__name__): - # load PyTorch class pt_model = model_class(config).eval() # Flax models don't use the `use_cache` option and cache is not returned as a default. 
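A second pattern repeated throughout these test hunks is the removal of the blank line that used to separate a `class ...:` or `def ...:` line from the first statement of its body (and from the body of `with self.subTest(...)` blocks), presumably a consequence of the formatter switch described at the top of this diff. A small, invented test class written in the resulting style would look like the following; the class name and checks are hypothetical and only illustrate the layout.

```python
import unittest

from transformers import BertConfig


class StyleExampleTest(unittest.TestCase):
    # The first attribute sits directly under the class statement, no blank line.
    config_class = BertConfig

    def setUp(self):
        # Likewise, each body starts right after its signature.
        self.config = self.config_class(hidden_size=32, num_attention_heads=4)

    def test_head_dim_divides_hidden_size(self):
        self.assertEqual(self.config.hidden_size % self.config.num_attention_heads, 0)
```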
diff --git a/tests/models/codegen/test_modeling_codegen.py b/tests/models/codegen/test_modeling_codegen.py index 091a8b401d8dc9..c5818d23eac030 100644 --- a/tests/models/codegen/test_modeling_codegen.py +++ b/tests/models/codegen/test_modeling_codegen.py @@ -349,7 +349,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (CodeGenModel, CodeGenForCausalLM) if is_torch_available() else () all_generative_model_classes = (CodeGenForCausalLM,) if is_torch_available() else () fx_compatible = False diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py index c15c8236b8da14..ec7c11dcef9d80 100644 --- a/tests/models/codegen/test_tokenization_codegen.py +++ b/tests/models/codegen/test_tokenization_codegen.py @@ -28,7 +28,6 @@ @require_tokenizers class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = CodeGenTokenizer rust_tokenizer_class = CodeGenTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index f93d3cabbf2f11..ba77431467b830 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -115,7 +115,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class ConditionalDetrImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/convbert/test_modeling_convbert.py b/tests/models/convbert/test_modeling_convbert.py index b8ab2c64725a25..28762ebcfb4677 100644 --- a/tests/models/convbert/test_modeling_convbert.py +++ b/tests/models/convbert/test_modeling_convbert.py @@ -246,7 +246,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ConvBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( ConvBertModel, @@ -419,7 +418,6 @@ def test_attention_outputs(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # ConvBertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == ConvBertForMultipleChoice: return diff --git a/tests/models/convbert/test_modeling_tf_convbert.py b/tests/models/convbert/test_modeling_tf_convbert.py index ae675b878ed145..e9d8630a598593 100644 --- a/tests/models/convbert/test_modeling_tf_convbert.py +++ b/tests/models/convbert/test_modeling_tf_convbert.py @@ -224,7 +224,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFConvBertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFConvBertModel, diff --git a/tests/models/convnext/test_image_processing_convnext.py b/tests/models/convnext/test_image_processing_convnext.py index da7d28e64dbce4..ced0765c352866 100644 --- a/tests/models/convnext/test_image_processing_convnext.py +++ b/tests/models/convnext/test_image_processing_convnext.py @@ -77,7 +77,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class ConvNextImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ConvNextImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/ctrl/test_modeling_ctrl.py b/tests/models/ctrl/test_modeling_ctrl.py index 0f2149ecf9be17..71d83dd3bb9e47 100644 --- a/tests/models/ctrl/test_modeling_ctrl.py +++ b/tests/models/ctrl/test_modeling_ctrl.py @@ -193,7 +193,6 @@ def create_and_check_ctrl_for_sequence_classification(self, config, input_ids, h @require_torch class CTRLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (CTRLModel, CTRLLMHeadModel, CTRLForSequenceClassification) if is_torch_available() else () all_generative_model_classes = (CTRLLMHeadModel,) if is_torch_available() else () test_pruning = True diff --git a/tests/models/ctrl/test_modeling_tf_ctrl.py b/tests/models/ctrl/test_modeling_tf_ctrl.py index d3e82e57c9f27f..5c7a92d73c49bc 100644 --- a/tests/models/ctrl/test_modeling_tf_ctrl.py +++ b/tests/models/ctrl/test_modeling_tf_ctrl.py @@ -169,7 +169,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel, TFCTRLForSequenceClassification) if is_tf_available() else () all_generative_model_classes = (TFCTRLLMHeadModel,) if is_tf_available() else () test_head_masking = False diff --git a/tests/models/ctrl/test_tokenization_ctrl.py b/tests/models/ctrl/test_tokenization_ctrl.py index 0bd4d8c8065cba..02c3459f9e0461 100644 --- a/tests/models/ctrl/test_tokenization_ctrl.py +++ b/tests/models/ctrl/test_tokenization_ctrl.py @@ -23,7 +23,6 @@ class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = CTRLTokenizer test_rust_tokenizer = False test_seq2seq = False diff --git a/tests/models/deberta/test_modeling_deberta.py b/tests/models/deberta/test_modeling_deberta.py index 940a82db4398a2..91f69a11b55532 100644 --- a/tests/models/deberta/test_modeling_deberta.py +++ b/tests/models/deberta/test_modeling_deberta.py @@ -214,7 +214,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DebertaModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DebertaModel, diff --git a/tests/models/deberta/test_modeling_tf_deberta.py b/tests/models/deberta/test_modeling_tf_deberta.py index c2584db30f1976..d544a12e68c269 100644 --- a/tests/models/deberta/test_modeling_tf_deberta.py +++ b/tests/models/deberta/test_modeling_tf_deberta.py @@ -208,7 +208,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class 
TFDebertaModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFDebertaModel, diff --git a/tests/models/deberta/test_tokenization_deberta.py b/tests/models/deberta/test_tokenization_deberta.py index 4aa53e13ff8d4e..81d7bd95bd8081 100644 --- a/tests/models/deberta/test_tokenization_deberta.py +++ b/tests/models/deberta/test_tokenization_deberta.py @@ -26,7 +26,6 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = DebertaTokenizer test_rust_tokenizer = True rust_tokenizer_class = DebertaTokenizerFast diff --git a/tests/models/deberta_v2/test_modeling_deberta_v2.py b/tests/models/deberta_v2/test_modeling_deberta_v2.py index 8c9bf3bbf7e5c9..ad22f4f9a9489c 100644 --- a/tests/models/deberta_v2/test_modeling_deberta_v2.py +++ b/tests/models/deberta_v2/test_modeling_deberta_v2.py @@ -227,7 +227,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DebertaV2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DebertaV2Model, diff --git a/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py b/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py index b2cc8896e46ee5..bd4d05e6c13958 100644 --- a/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py +++ b/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py @@ -210,7 +210,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFDebertaModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFDebertaV2Model, diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py index f2831315e5c29c..961cd82f548c3c 100644 --- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py +++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py @@ -27,7 +27,6 @@ @require_sentencepiece @require_tokenizers class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = DebertaV2Tokenizer rust_tokenizer_class = DebertaV2TokenizerFast test_sentencepiece = True @@ -166,7 +165,6 @@ def test_do_lower_case_false_split_by_punct_false(self): self.assertListEqual(rust_tokens, tokens_target) def test_rust_and_python_full_tokenizers(self): - tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/decision_transformer/test_modeling_decision_transformer.py b/tests/models/decision_transformer/test_modeling_decision_transformer.py index ece5ac333945ea..10ca3a767df72d 100644 --- a/tests/models/decision_transformer/test_modeling_decision_transformer.py +++ b/tests/models/decision_transformer/test_modeling_decision_transformer.py @@ -132,7 +132,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DecisionTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (DecisionTransformerModel,) if is_torch_available() else () all_generative_model_classes = () diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py index 98ebac4bbbb433..c0d927b9c980d4 100644 --- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py +++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py @@ -115,7 +115,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class DeformableDetrImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = 
DeformableDetrImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/deit/test_image_processing_deit.py b/tests/models/deit/test_image_processing_deit.py index d2919ccc2ab94b..4103fc8fb25d5e 100644 --- a/tests/models/deit/test_image_processing_deit.py +++ b/tests/models/deit/test_image_processing_deit.py @@ -82,7 +82,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class DeiTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = DeiTImageProcessor if is_vision_available() else None test_cast_dtype = True diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py index 19858cb5b7f95d..612c6d0352b10b 100644 --- a/tests/models/deit/test_modeling_deit.py +++ b/tests/models/deit/test_modeling_deit.py @@ -327,7 +327,6 @@ def test_problem_types(self): for problem_type in problem_types: with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] config.num_labels = problem_type["num_labels"] diff --git a/tests/models/deta/test_image_processing_deta.py b/tests/models/deta/test_image_processing_deta.py index c1d26b2fdf54fb..e6e98476657746 100644 --- a/tests/models/deta/test_image_processing_deta.py +++ b/tests/models/deta/test_image_processing_deta.py @@ -115,7 +115,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class DetaImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = DetaImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index 1638cb6794ffc3..d6354de43dbe29 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -115,7 +115,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class DetrImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = DetrImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py index 2f87f60697b2af..00352026ecf73f 100644 --- a/tests/models/dinat/test_modeling_dinat.py +++ b/tests/models/dinat/test_modeling_dinat.py @@ -193,7 +193,6 @@ def prepare_config_and_inputs_for_common(self): @require_natten @require_torch class DinatModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DinatModel, diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py index 9b4606b484ee55..50902268fd9064 100644 --- a/tests/models/distilbert/test_modeling_distilbert.py +++ b/tests/models/distilbert/test_modeling_distilbert.py @@ -196,7 +196,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DistilBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DistilBertModel, @@ -256,7 +255,6 @@ def test_model_from_pretrained(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # BertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == DistilBertForMultipleChoice: return diff --git a/tests/models/distilbert/test_modeling_flax_distilbert.py b/tests/models/distilbert/test_modeling_flax_distilbert.py index e0f609b4ddf309..f4481a6e4a077a 100644 --- a/tests/models/distilbert/test_modeling_flax_distilbert.py +++ b/tests/models/distilbert/test_modeling_flax_distilbert.py @@ -24,6 +24,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.models.distilbert.modeling_flax_distilbert import ( FlaxDistilBertForMaskedLM, FlaxDistilBertForMultipleChoice, @@ -111,7 +112,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxDistilBertModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( FlaxDistilBertModel, diff --git a/tests/models/distilbert/test_modeling_tf_distilbert.py b/tests/models/distilbert/test_modeling_tf_distilbert.py index e52532d5618aae..a59401dc506b1c 100644 --- a/tests/models/distilbert/test_modeling_tf_distilbert.py +++ b/tests/models/distilbert/test_modeling_tf_distilbert.py @@ -170,7 +170,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFDistilBertModel, diff --git a/tests/models/distilbert/test_tokenization_distilbert.py b/tests/models/distilbert/test_tokenization_distilbert.py index 7b2c97d78a0fb6..09422395720988 100644 --- a/tests/models/distilbert/test_tokenization_distilbert.py +++ b/tests/models/distilbert/test_tokenization_distilbert.py @@ -22,7 +22,6 @@ @require_tokenizers class DistilBertTokenizationTest(BertTokenizationTest): - tokenizer_class = DistilBertTokenizer rust_tokenizer_class = DistilBertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/donut/test_image_processing_donut.py b/tests/models/donut/test_image_processing_donut.py index 5ac4d1fc3d1b7f..bd992626494aa6 100644 --- a/tests/models/donut/test_image_processing_donut.py +++ b/tests/models/donut/test_image_processing_donut.py @@ -82,7 +82,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class DonutImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = DonutImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py index a35a65505981f1..6a0809587c8a99 100644 --- a/tests/models/donut/test_modeling_donut_swin.py +++ b/tests/models/donut/test_modeling_donut_swin.py @@ -144,7 +144,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DonutSwinModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (DonutSwinModel,) if is_torch_available() else () fx_compatible = True diff --git a/tests/models/dpr/test_modeling_dpr.py b/tests/models/dpr/test_modeling_dpr.py index 708f1d53c3a46e..482caa054707c7 100644 --- a/tests/models/dpr/test_modeling_dpr.py +++ b/tests/models/dpr/test_modeling_dpr.py @@ -179,7 +179,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DPRModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DPRContextEncoder, diff --git a/tests/models/dpr/test_modeling_tf_dpr.py b/tests/models/dpr/test_modeling_tf_dpr.py index 86ef3837f1fa01..4910252278fe5c 100644 --- a/tests/models/dpr/test_modeling_tf_dpr.py +++ b/tests/models/dpr/test_modeling_tf_dpr.py @@ -172,7 +172,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFDPRModelTest(TFModelTesterMixin, 
unittest.TestCase): - all_model_classes = ( ( TFDPRContextEncoder, diff --git a/tests/models/dpr/test_tokenization_dpr.py b/tests/models/dpr/test_tokenization_dpr.py index 8ad2fea09c8bd7..db41052d4cd0e2 100644 --- a/tests/models/dpr/test_tokenization_dpr.py +++ b/tests/models/dpr/test_tokenization_dpr.py @@ -30,7 +30,6 @@ @require_tokenizers class DPRContextEncoderTokenizationTest(BertTokenizationTest): - tokenizer_class = DPRContextEncoderTokenizer rust_tokenizer_class = DPRContextEncoderTokenizerFast test_rust_tokenizer = True @@ -38,7 +37,6 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest): @require_tokenizers class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): - tokenizer_class = DPRQuestionEncoderTokenizer rust_tokenizer_class = DPRQuestionEncoderTokenizerFast test_rust_tokenizer = True @@ -46,7 +44,6 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): @require_tokenizers class DPRReaderTokenizationTest(BertTokenizationTest): - tokenizer_class = DPRReaderTokenizer rust_tokenizer_class = DPRReaderTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/dpt/test_image_processing_dpt.py b/tests/models/dpt/test_image_processing_dpt.py index 4ed6faadb6a8e0..4cde4cbe73f03a 100644 --- a/tests/models/dpt/test_image_processing_dpt.py +++ b/tests/models/dpt/test_image_processing_dpt.py @@ -74,7 +74,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class DPTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = DPTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/efficientformer/test_image_processing_efficientformer.py b/tests/models/efficientformer/test_image_processing_efficientformer.py index 6a17783f61d108..ff2fcafd4b7b60 100644 --- a/tests/models/efficientformer/test_image_processing_efficientformer.py +++ b/tests/models/efficientformer/test_image_processing_efficientformer.py @@ -74,7 +74,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class EfficientFormerImageProcessorTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ViTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/efficientformer/test_modeling_efficientformer.py b/tests/models/efficientformer/test_modeling_efficientformer.py index 4426b37e9bbf9c..4627c545ea7f2a 100644 --- a/tests/models/efficientformer/test_modeling_efficientformer.py +++ b/tests/models/efficientformer/test_modeling_efficientformer.py @@ -325,7 +325,6 @@ def test_problem_types(self): for problem_type in problem_types: with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] config.num_labels = problem_type["num_labels"] diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py index bdc0715f745813..7311df1e78ea3e 100644 --- a/tests/models/electra/test_modeling_electra.py +++ b/tests/models/electra/test_modeling_electra.py @@ -375,7 +375,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ElectraModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( ElectraModel, diff --git a/tests/models/electra/test_modeling_flax_electra.py b/tests/models/electra/test_modeling_flax_electra.py index cd1a795a19efec..0dda4e38fddaf7 100644 --- a/tests/models/electra/test_modeling_flax_electra.py +++ b/tests/models/electra/test_modeling_flax_electra.py @@ -105,7 
+105,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxElectraModelTest(FlaxModelTesterMixin, unittest.TestCase): - test_head_masking = True all_model_classes = ( diff --git a/tests/models/electra/test_modeling_tf_electra.py b/tests/models/electra/test_modeling_tf_electra.py index 0c0c4f77ab3245..a63f0087876851 100644 --- a/tests/models/electra/test_modeling_tf_electra.py +++ b/tests/models/electra/test_modeling_tf_electra.py @@ -488,7 +488,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFElectraModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFElectraModel, diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index 8f565aec061038..c476744057e89d 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -72,7 +72,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -106,7 +106,7 @@ def check_encoder_decoder_model( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -167,7 +167,7 @@ def check_encoder_decoder_model_from_pretrained_using_model_paths( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: @@ -210,7 +210,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -240,7 +240,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -281,7 +281,7 @@ def check_save_and_load_encoder_decoder_model( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -327,7 +327,7 @@ def check_encoder_decoder_model_labels( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -395,7 +395,7 @@ def check_encoder_decoder_model_output_attentions( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -424,7 +424,7 @@ def 
check_encoder_decoder_model_output_attentions_from_config( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): # Similar to `check_encoder_decoder_model_output_attentions`, but with `output_attentions` triggered from the # config file. Contrarily to most models, changing the model's config won't work -- the defaults are loaded @@ -491,7 +491,7 @@ def create_and_check_encoder_decoder_shared_weights( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): torch.manual_seed(0) encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) diff --git a/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py index 9d807e9f650e70..362a5f74a1b6ad 100644 --- a/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py @@ -69,7 +69,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -102,7 +102,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -131,7 +131,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model} @@ -170,7 +170,7 @@ def check_encoder_decoder_model_from_encoder_decoder_pretrained( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) # assert that model attributes match those of configs @@ -215,7 +215,7 @@ def check_encoder_decoder_model_output_attentions( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -292,7 +292,6 @@ def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config self.assertEqual(generated_sequences.shape, (input_ids.shape[0],) + (decoder_config.max_length,)) def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): - pt_model.to(torch_device) pt_model.eval() @@ -334,7 +333,6 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 1e-5) def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): - encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = EncoderDecoderModel(encoder_decoder_config) @@ -346,7 +344,6 @@ def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict) def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict): - encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = 
EncoderDecoderModel(encoder_decoder_config) @@ -390,7 +387,6 @@ def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): @is_pt_flax_cross_test def test_pt_flax_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() config = config_inputs_dict.pop("config") decoder_config = config_inputs_dict.pop("decoder_config") @@ -589,7 +585,6 @@ def get_from_encoderdecoder_pretrained_model(self): return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2") def _check_configuration_tie(self, model): - module = model.module.bind(model.params) assert id(module.decoder.config) == id(model.config.decoder) diff --git a/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py index d719d6f4c48697..76ebd687f77edf 100644 --- a/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py @@ -78,7 +78,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -111,7 +111,7 @@ def check_encoder_decoder_model( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -160,7 +160,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -190,7 +190,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -231,7 +231,7 @@ def check_encoder_decoder_model_labels( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -298,7 +298,7 @@ def check_encoder_decoder_model_output_attentions( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -326,7 +326,7 @@ def check_encoder_decoder_model_output_attentions_from_config( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # Similar to `check_encoder_decoder_model_output_attentions`, but with `output_attentions` triggered from the # config file. 
Contrarily to most models, changing the model's config won't work -- the defaults are loaded @@ -470,7 +470,6 @@ def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=1e-5, nam ) def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): - pt_inputs_dict = {} for name, key in tf_inputs_dict.items(): if type(key) == bool: @@ -490,7 +489,6 @@ def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): return pt_inputs_dict def check_pt_tf_models(self, tf_model, pt_model, tf_inputs_dict): - pt_inputs_dict = self.prepare_pt_inputs_from_tf_inputs(tf_inputs_dict) # send pytorch inputs to the correct device @@ -607,7 +605,6 @@ def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): @is_pt_tf_cross_test def test_pt_tf_model_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() labels = config_inputs_dict.pop("decoder_token_labels") @@ -762,7 +759,6 @@ def prepare_config_and_inputs(self): @slow @is_pt_tf_cross_test def test_bert2bert_summarization(self): - from transformers import EncoderDecoderModel tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") @@ -863,7 +859,6 @@ def prepare_config_and_inputs(self): @slow @is_pt_tf_cross_test def test_bert2gpt2_summarization(self): - from transformers import EncoderDecoderModel tokenizer_in = AutoTokenizer.from_pretrained("bert-base-cased") @@ -1171,7 +1166,6 @@ def test_encoder_decoder_from_pretrained(self): decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids with tempfile.TemporaryDirectory() as tmp_dirname: - # Since most of HF's models don't have pretrained cross-attention layers, they are randomly # initialized even if we create models using `from_pretrained` method. # For the tests, the decoder need to be a model with pretrained cross-attention layers. 
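The `**kwargs` to `**kwargs,` changes in the encoder-decoder test hunks above add the trailing comma that the formatter now keeps on the last line of an exploded signature (valid syntax on the Python 3 versions the library supports). The function below is a made-up stand-in, not one of the tester helpers in the diff, showing only the resulting signature shape.

```python
def run_toy_check(
    config,
    decoder_config,
    decoder_input_ids,
    decoder_attention_mask,
    **kwargs,  # trailing comma after **kwargs, as in the hunks above
):
    # Echo the inputs so the example runs on its own.
    return {
        "has_configs": config is not None and decoder_config is not None,
        "num_decoder_ids": len(decoder_input_ids),
        "num_mask_entries": len(decoder_attention_mask),
        "extra_options": dict(kwargs),
    }


print(run_toy_check({}, {}, [5, 6, 7], [1, 1, 1], return_dict=True))
```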
diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py index 8db290880edd91..d6700da7c9ff99 100644 --- a/tests/models/esm/test_modeling_esm.py +++ b/tests/models/esm/test_modeling_esm.py @@ -166,7 +166,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class EsmModelTest(ModelTesterMixin, unittest.TestCase): - test_mismatched_shapes = False all_model_classes = ( diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py index ed307beef1ee97..a0d4232d6d8f59 100644 --- a/tests/models/esm/test_modeling_esmfold.py +++ b/tests/models/esm/test_modeling_esmfold.py @@ -144,7 +144,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class EsmFoldModelTest(ModelTesterMixin, unittest.TestCase): - test_mismatched_shapes = False all_model_classes = (EsmForProteinFolding,) if is_torch_available() else () diff --git a/tests/models/esm/test_modeling_tf_esm.py b/tests/models/esm/test_modeling_tf_esm.py index c6db0fe99f642f..0e81efd458cbfc 100644 --- a/tests/models/esm/test_modeling_tf_esm.py +++ b/tests/models/esm/test_modeling_tf_esm.py @@ -195,7 +195,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFEsmModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFEsmModel, diff --git a/tests/models/flaubert/test_modeling_flaubert.py b/tests/models/flaubert/test_modeling_flaubert.py index 2cd204ebc3ddf0..b792f14e357a25 100644 --- a/tests/models/flaubert/test_modeling_flaubert.py +++ b/tests/models/flaubert/test_modeling_flaubert.py @@ -363,7 +363,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FlaubertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( FlaubertModel, @@ -439,7 +438,6 @@ def test_model_from_pretrained(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # FlauBertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == FlaubertForMultipleChoice: return diff --git a/tests/models/flaubert/test_modeling_tf_flaubert.py b/tests/models/flaubert/test_modeling_tf_flaubert.py index 09ba6f45d8d0b9..584f904ce71ddf 100644 --- a/tests/models/flaubert/test_modeling_tf_flaubert.py +++ b/tests/models/flaubert/test_modeling_tf_flaubert.py @@ -275,7 +275,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFFlaubertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFFlaubertModel, diff --git a/tests/models/flava/test_image_processing_flava.py b/tests/models/flava/test_image_processing_flava.py index 129343b998c83f..f9751725697e0a 100644 --- a/tests/models/flava/test_image_processing_flava.py +++ b/tests/models/flava/test_image_processing_flava.py @@ -160,7 +160,6 @@ def get_expected_codebook_image_size(self): @require_torch @require_vision class FlavaImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = FlavaImageProcessor if is_vision_available() else None maxDiff = None diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index 44aff1025f2e61..7af98e21fd790f 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -22,8 +22,8 @@ import unittest import numpy as np - import requests + from transformers import ( FlavaConfig, FlavaImageCodebookConfig, @@ -435,7 +435,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FlavaTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FlavaTextModel,) if is_torch_available() else () test_pruning = False test_head_masking = False @@ -569,7 +568,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FlavaMultimodalModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FlavaMultimodalModel,) if is_torch_available() else () test_pruning = False test_head_masking = False @@ -667,7 +665,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FlavaImageCodebookTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FlavaImageCodebook,) if is_torch_available() else () test_pruning = False test_head_masking = False @@ -756,7 +753,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, ): - if text_kwargs is None: text_kwargs = {} if image_kwargs is None: diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index 5d975b061f75f3..03c1edd4ca1a3e 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -266,7 +266,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FNetModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( FNetModel, diff --git a/tests/models/fnet/test_tokenization_fnet.py b/tests/models/fnet/test_tokenization_fnet.py index 0058155bdb6d3e..17fe3e0dd30820 100644 --- a/tests/models/fnet/test_tokenization_fnet.py +++ b/tests/models/fnet/test_tokenization_fnet.py @@ -28,7 +28,6 @@ @require_sentencepiece @require_tokenizers class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = FNetTokenizer rust_tokenizer_class = FNetTokenizerFast test_rust_tokenizer = True @@ -145,7 +144,6 @@ def test_sequence_builders(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = 
[AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( diff --git a/tests/models/fsmt/test_modeling_fsmt.py b/tests/models/fsmt/test_modeling_fsmt.py index 7710152634ea46..e5526209bb22a9 100644 --- a/tests/models/fsmt/test_modeling_fsmt.py +++ b/tests/models/fsmt/test_modeling_fsmt.py @@ -17,8 +17,8 @@ import unittest import timeout_decorator # noqa - from parameterized import parameterized + from transformers import FSMTConfig, is_torch_available from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from transformers.utils import cached_property @@ -528,7 +528,6 @@ def test_odd_embed_dim(self): @unittest.skip("different from marian (needs more research)") def test_positional_emb_weights_against_marian(self): - desired_weights = torch.tensor( [ [0, 0, 0, 0, 0], diff --git a/tests/models/funnel/test_tokenization_funnel.py b/tests/models/funnel/test_tokenization_funnel.py index e46928a538fdf9..6c5eb87db17ce5 100644 --- a/tests/models/funnel/test_tokenization_funnel.py +++ b/tests/models/funnel/test_tokenization_funnel.py @@ -26,7 +26,6 @@ @require_tokenizers class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = FunnelTokenizer rust_tokenizer_class = FunnelTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 40e435056b1955..b3b34f664d3c7a 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -17,6 +17,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import GitConfig, GitProcessor, GitVisionConfig, is_torch_available, is_vision_available from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_vision, slow, torch_device @@ -360,7 +361,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class GitModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else () all_generative_model_classes = (GitForCausalLM,) if is_torch_available() else () fx_compatible = False diff --git a/tests/models/glpn/test_image_processing_glpn.py b/tests/models/glpn/test_image_processing_glpn.py index cddc80d9ae2b22..dddc2807bc046d 100644 --- a/tests/models/glpn/test_image_processing_glpn.py +++ b/tests/models/glpn/test_image_processing_glpn.py @@ -67,7 +67,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class GLPNImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = GLPNImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py index 7d34a7f4f30dbd..233e8ce7313bef 100644 --- a/tests/models/glpn/test_modeling_glpn.py +++ b/tests/models/glpn/test_modeling_glpn.py @@ -144,7 +144,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class GLPNModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (GLPNModel, GLPNForDepthEstimation) if is_torch_available() else () test_head_masking = False diff --git a/tests/models/gpt2/test_modeling_flax_gpt2.py b/tests/models/gpt2/test_modeling_flax_gpt2.py index 23ff3f11b0aee5..e842bbc73268d9 100644 --- a/tests/models/gpt2/test_modeling_flax_gpt2.py +++ b/tests/models/gpt2/test_modeling_flax_gpt2.py @@ -29,6 +29,7 @@ if is_flax_available(): 
     import jax
     import jax.numpy as jnp
+
     from transformers.modeling_flax_pytorch_utils import (
         convert_pytorch_state_dict_to_flax,
         load_flax_weights_in_pytorch_model,
@@ -189,7 +190,6 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input
 @require_flax
 class FlaxGPT2ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (FlaxGPT2Model, FlaxGPT2LMHeadModel) if is_flax_available() else ()
     all_generative_model_classes = (FlaxGPT2LMHeadModel,) if is_flax_available() else ()
diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py
index 2f6f8d12143d57..c9a4cd771dc621 100644
--- a/tests/models/gpt2/test_modeling_gpt2.py
+++ b/tests/models/gpt2/test_modeling_gpt2.py
@@ -430,7 +430,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (
         (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2ForSequenceClassification, GPT2ForTokenClassification)
         if is_torch_available()
diff --git a/tests/models/gpt2/test_modeling_tf_gpt2.py b/tests/models/gpt2/test_modeling_tf_gpt2.py
index 64cbea4de97797..298d501e586644 100644
--- a/tests/models/gpt2/test_modeling_tf_gpt2.py
+++ b/tests/models/gpt2/test_modeling_tf_gpt2.py
@@ -355,7 +355,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_tf
 class TFGPT2ModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (
         (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2ForSequenceClassification, TFGPT2DoubleHeadsModel)
         if is_tf_available()
@@ -439,7 +438,6 @@ def test_onnx_runtime_optimize(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
-
             # Skip these 2 classes which uses `tf.gather` with `batch_dims=1`
             if model_class in [TFGPT2ForSequenceClassification, TFGPT2DoubleHeadsModel]:
                 continue
diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py
index 3273fbfce77378..17fbe51713c8ac 100644
--- a/tests/models/gpt2/test_tokenization_gpt2.py
+++ b/tests/models/gpt2/test_tokenization_gpt2.py
@@ -27,7 +27,6 @@
 @require_tokenizers
 class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
     tokenizer_class = GPT2Tokenizer
     rust_tokenizer_class = GPT2TokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/gpt2/test_tokenization_gpt2_tf.py b/tests/models/gpt2/test_tokenization_gpt2_tf.py
index 117f959831a77d..e92c9e65dfd38d 100644
--- a/tests/models/gpt2/test_tokenization_gpt2_tf.py
+++ b/tests/models/gpt2/test_tokenization_gpt2_tf.py
@@ -28,7 +28,6 @@ def __init__(self, tokenizer):
     @tf.function(input_signature=(tf.TensorSpec((None,), tf.string, name="text"),))
     def serving(self, text):
-
         tokenized = self.tokenizer(text)
         input_ids_dense = tokenized["input_ids"].to_tensor()
diff --git a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
index 706b7c6cabaf3d..a32f35f6e747b1 100644
--- a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
+++ b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
@@ -29,6 +29,7 @@
 if is_flax_available():
     import jax
     import jax.numpy as jnp
+
     from transformers.modeling_flax_pytorch_utils import (
         convert_pytorch_state_dict_to_flax,
         load_flax_weights_in_pytorch_model,
@@ -181,7 +182,6 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input
 @require_flax
 class FlaxGPTNeoModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (FlaxGPTNeoModel, FlaxGPTNeoForCausalLM) if is_flax_available() else ()
     all_generative_model_classes = (FlaxGPTNeoForCausalLM,) if is_flax_available() else ()
diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py
index 534c29b82bd4b6..53cc45aac55119 100644
--- a/tests/models/gpt_neo/test_modeling_gpt_neo.py
+++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py
@@ -372,7 +372,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (
         (GPTNeoModel, GPTNeoForCausalLM, GPTNeoForSequenceClassification) if is_torch_available() else ()
     )
diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py
index 0435624f6f1122..d57ce69f1f0fb1 100644
--- a/tests/models/gpt_neox/test_modeling_gpt_neox.py
+++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py
@@ -186,7 +186,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GPTNeoXModelTest(ModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (GPTNeoXModel, GPTNeoXForCausalLM) if is_torch_available() else ()
     all_generative_model_classes = (GPTNeoXForCausalLM,) if is_torch_available() else ()
     test_pruning = False
diff --git a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
index 32f118ba06066b..56a5f92a8322f4 100644
--- a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
+++ b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
@@ -190,7 +190,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GPTNeoXModelJapaneseTest(ModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (GPTNeoXJapaneseModel, GPTNeoXJapaneseForCausalLM) if is_torch_available() else ()
     all_generative_model_classes = (GPTNeoXJapaneseForCausalLM,) if is_torch_available() else ()
     test_pruning = False
diff --git a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
index 4af4da30a7b5e9..293116a24e33bb 100644
--- a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
+++ b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
@@ -29,7 +29,6 @@
 @require_tokenizers
 class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
     tokenizer_class = GPTNeoXJapaneseTokenizer
     test_rust_tokenizer = False
     from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
diff --git a/tests/models/gptj/test_modeling_flax_gptj.py b/tests/models/gptj/test_modeling_flax_gptj.py
index 9a6472bc92ee3f..d177e345e88eca 100644
--- a/tests/models/gptj/test_modeling_flax_gptj.py
+++ b/tests/models/gptj/test_modeling_flax_gptj.py
@@ -29,6 +29,7 @@
 if is_flax_available():
     import jax
     import jax.numpy as jnp
+
     from transformers.modeling_flax_pytorch_utils import (
         convert_pytorch_state_dict_to_flax,
         load_flax_weights_in_pytorch_model,
@@ -178,7 +179,6 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input
 @require_flax
 class FlaxGPTJModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (FlaxGPTJModel, FlaxGPTJForCausalLM) if is_flax_available() else ()
     all_generative_model_classes = (FlaxGPTJForCausalLM,) if is_flax_available() else ()
diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py
index bb20c8cee6fa73..3cdb5a4b6fb916 100644
--- a/tests/models/gptj/test_modeling_gptj.py
+++ b/tests/models/gptj/test_modeling_gptj.py
@@ -361,7 +361,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GPTJModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (
         (GPTJModel, GPTJForCausalLM, GPTJForSequenceClassification, GPTJForQuestionAnswering)
         if is_torch_available()
diff --git a/tests/models/gptj/test_modeling_tf_gptj.py b/tests/models/gptj/test_modeling_tf_gptj.py
index ec6c15d3f6d640..6557428c07df7c 100644
--- a/tests/models/gptj/test_modeling_tf_gptj.py
+++ b/tests/models/gptj/test_modeling_tf_gptj.py
@@ -287,7 +287,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_tf
 class TFGPTJModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (
         (TFGPTJForCausalLM, TFGPTJForSequenceClassification, TFGPTJForQuestionAnswering, TFGPTJModel)
         if is_tf_available()
diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py
index 3b396daa677eb2..cd17e4d9185186 100644
--- a/tests/models/groupvit/test_modeling_groupvit.py
+++ b/tests/models/groupvit/test_modeling_groupvit.py
@@ -22,8 +22,8 @@
 import unittest
 import numpy as np
-
 import requests
+
 from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
 from transformers.testing_utils import is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device
 from transformers.utils import is_torch_available, is_vision_available
@@ -432,7 +432,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GroupViTTextModelTest(ModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (GroupViTTextModel,) if is_torch_available() else ()
     test_pruning = False
     test_head_masking = False
@@ -475,7 +474,6 @@ def test_model_from_pretrained(self):
 class GroupViTModelTester:
     def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
-
         if text_kwargs is None:
             text_kwargs = {}
         if vision_kwargs is None:
diff --git a/tests/models/groupvit/test_modeling_tf_groupvit.py b/tests/models/groupvit/test_modeling_tf_groupvit.py
index 45bc8b8ec3b0c8..6283ab8988d540 100644
--- a/tests/models/groupvit/test_modeling_tf_groupvit.py
+++ b/tests/models/groupvit/test_modeling_tf_groupvit.py
@@ -23,8 +23,8 @@
 from importlib import import_module
 import numpy as np
-
 import requests
+
 from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
 from transformers.testing_utils import (
     is_pt_tf_cross_test,
@@ -96,7 +96,6 @@ def __init__(
         self.seq_length = num_patches
     def prepare_config_and_inputs(self):
-
         rng = random.Random(0)
         pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], rng=rng)
         config = self.get_config()
@@ -452,7 +451,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_tf
 class TFGroupViTTextModelTest(TFModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (TFGroupViTTextModel,) if is_tf_available() else ()
     test_pruning = False
     test_head_masking = False
diff --git a/tests/models/herbert/test_tokenization_herbert.py b/tests/models/herbert/test_tokenization_herbert.py
index 3e8d3ac6ea2993..1afea16bdd28c2 100644
--- a/tests/models/herbert/test_tokenization_herbert.py
+++ b/tests/models/herbert/test_tokenization_herbert.py
@@ -27,7 +27,6 @@
@require_tokenizers class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = HerbertTokenizer rust_tokenizer_class = HerbertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/hubert/test_modeling_tf_hubert.py b/tests/models/hubert/test_modeling_tf_hubert.py index d37679831d0f8e..084a4001100cd4 100644 --- a/tests/models/hubert/test_modeling_tf_hubert.py +++ b/tests/models/hubert/test_modeling_tf_hubert.py @@ -219,7 +219,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFHubertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFHubertModel, TFHubertForCTC) if is_tf_available() else () test_resize_embeddings = False test_head_masking = False diff --git a/tests/models/ibert/test_modeling_ibert.py b/tests/models/ibert/test_modeling_ibert.py index c8ca026688b6ae..d797d6aa285a6f 100644 --- a/tests/models/ibert/test_modeling_ibert.py +++ b/tests/models/ibert/test_modeling_ibert.py @@ -225,7 +225,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class IBertModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = False test_torchscript = False test_head_masking = False diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index efc456e9987735..b0a2d5ceb0a3bd 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -79,7 +79,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class ImageGPTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ImageGPTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index 88e1e76c450802..c60282aff64393 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -265,7 +265,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( (ImageGPTForCausalImageModeling, ImageGPTForImageClassification, ImageGPTModel) if is_torch_available() else () ) diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index 16cacab88c8614..087c3aa27a13c7 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -220,7 +220,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class LayoutLMModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( LayoutLMModel, diff --git a/tests/models/layoutlm/test_modeling_tf_layoutlm.py b/tests/models/layoutlm/test_modeling_tf_layoutlm.py index 7bcf6e590b9857..71963e69c84df1 100644 --- a/tests/models/layoutlm/test_modeling_tf_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_tf_layoutlm.py @@ -207,7 +207,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFLayoutLMModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFLayoutLMModel, diff --git a/tests/models/layoutlm/test_tokenization_layoutlm.py b/tests/models/layoutlm/test_tokenization_layoutlm.py index 3663355ee50717..b73b2aa8e44658 100644 --- a/tests/models/layoutlm/test_tokenization_layoutlm.py +++ b/tests/models/layoutlm/test_tokenization_layoutlm.py @@ -26,7 +26,6 @@ 
@require_tokenizers class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = LayoutLMTokenizer rust_tokenizer_class = LayoutLMTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py index d2eae1d8df36a5..52bb80e14c98d5 100644 --- a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py @@ -64,7 +64,6 @@ def prepare_image_processor_dict(self): @require_torch @require_pytesseract class LayoutLMv2ImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = LayoutLMv2ImageProcessor if is_pytesseract_available() else None def setUp(self): diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 3c38373163e496..a4578d35345f4d 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -258,7 +258,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch @require_detectron2 class LayoutLMv2ModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = False test_torchscript = True test_mismatched_shapes = False diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index e48272c151fee7..9224fbd87ea49c 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -291,7 +291,6 @@ def test_add_special_tokens(self): tokenizers: List[LayoutLMv2Tokenizer] = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - special_token = "[SPECIAL_TOKEN]" special_token_box = [1000, 1000, 1000, 1000] @@ -526,7 +525,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1195,7 +1193,6 @@ def test_token_type_ids(self): tokenizers = self.get_tokenizers() for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1283,7 +1280,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return @@ -1551,7 +1547,6 @@ def test_sequence_ids(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( @@ -1787,7 +1782,6 @@ def test_batch_encode_dynamic_overflowing(self): tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): - if is_torch_available(): returned_tensor = "pt" elif is_tf_available(): @@ -2398,7 +2392,6 @@ def test_only_label_first_subword(self): @slow def test_layoutlmv2_integration_test(self): - tokenizer_p = 
LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased") tokenizer_r = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased") diff --git a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py index c61d52b65a9014..8827cdeea2335f 100644 --- a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py @@ -64,7 +64,6 @@ def prepare_image_processor_dict(self): @require_torch @require_pytesseract class LayoutLMv3ImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = LayoutLMv3ImageProcessor if is_pytesseract_available() else None def setUp(self): diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py index d5c8d42d22177a..601eeb58468572 100644 --- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py @@ -270,7 +270,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class LayoutLMv3ModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = False test_torchscript = False test_mismatched_shapes = False diff --git a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py index f71aeb0aefb4df..39de55efadf4d7 100644 --- a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py @@ -264,7 +264,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFLayoutLMv3ModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFLayoutLMv3Model, diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 322ed6861ff693..884f87680353ac 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -171,7 +171,6 @@ def test_add_special_tokens(self): tokenizers: List[LayoutLMv3Tokenizer] = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - special_token = "[SPECIAL_TOKEN]" special_token_box = [1000, 1000, 1000, 1000] @@ -406,7 +405,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1075,7 +1073,6 @@ def test_token_type_ids(self): tokenizers = self.get_tokenizers() for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1161,7 +1158,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return @@ -1429,7 +1425,6 @@ def test_sequence_ids(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( @@ -1665,7 +1660,6 @@ def test_batch_encode_dynamic_overflowing(self): tokenizer = 
self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): - if is_torch_available(): returned_tensor = "pt" elif is_tf_available(): @@ -1756,7 +1750,6 @@ def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, return words, boxes, output_ids def test_added_token_with_space_before(self): - tokenizer_s = self.get_tokenizer() tokenizer_f = self.get_rust_tokenizer() @@ -2316,7 +2309,6 @@ def test_only_label_first_subword(self): @slow def test_layoutlmv3_integration_test(self): - tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base") tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base") diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index e74dfe496c1c71..bf295c9c925e0e 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -194,7 +194,6 @@ def test_add_special_tokens(self): tokenizers: List[LayoutXLMTokenizer] = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - special_token = "[SPECIAL_TOKEN]" special_token_box = [1000, 1000, 1000, 1000] @@ -425,7 +424,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1098,7 +1096,6 @@ def test_token_type_ids(self): tokenizers = self.get_tokenizers() for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1185,7 +1182,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return @@ -1448,7 +1444,6 @@ def test_sequence_ids(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( @@ -1684,7 +1679,6 @@ def test_batch_encode_dynamic_overflowing(self): tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): - if is_torch_available(): returned_tensor = "pt" elif is_tf_available(): @@ -1853,7 +1847,6 @@ def test_only_label_first_subword(self): @slow def test_layoutxlm_integration_test(self): - tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base") tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base") diff --git a/tests/models/led/test_tokenization_led.py b/tests/models/led/test_tokenization_led.py index 2c761ad17a9a9c..7ff81749946aa4 100644 --- a/tests/models/led/test_tokenization_led.py +++ b/tests/models/led/test_tokenization_led.py @@ -128,7 +128,6 @@ def test_prepare_batch_not_longer_than_maxlen(self): @require_torch def test_special_tokens(self): - src_text = ["A long paragraph for summarization."] tgt_text = [ "Summary of the text.", diff --git 
a/tests/models/levit/test_image_processing_levit.py b/tests/models/levit/test_image_processing_levit.py index 8fba9a5d03f8e9..12d64c81771550 100644 --- a/tests/models/levit/test_image_processing_levit.py +++ b/tests/models/levit/test_image_processing_levit.py @@ -81,7 +81,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class LevitImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = LevitImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/levit/test_modeling_levit.py b/tests/models/levit/test_modeling_levit.py index 2b3436f3d05a50..1bc8eb4f6a55aa 100644 --- a/tests/models/levit/test_modeling_levit.py +++ b/tests/models/levit/test_modeling_levit.py @@ -337,7 +337,6 @@ def test_training_gradient_checkpointing(self): loss.backward() def test_problem_types(self): - parsed_torch_version_base = version.parse(version.parse(torch.__version__).base_version) if parsed_torch_version_base.base_version.startswith("1.9"): self.skipTest(reason="This test fails with PyTorch 1.9.x: some CUDA issue") @@ -362,7 +361,6 @@ def test_problem_types(self): for problem_type in problem_types: with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] config.num_labels = problem_type["num_labels"] diff --git a/tests/models/lilt/test_modeling_lilt.py b/tests/models/lilt/test_modeling_lilt.py index a4f189fc848a58..4eaff42684a2fc 100644 --- a/tests/models/lilt/test_modeling_lilt.py +++ b/tests/models/lilt/test_modeling_lilt.py @@ -219,7 +219,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class LiltModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( LiltModel, diff --git a/tests/models/longformer/test_modeling_tf_longformer.py b/tests/models/longformer/test_modeling_tf_longformer.py index 60a8ce01f4af45..ea3ba37407a34a 100644 --- a/tests/models/longformer/test_modeling_tf_longformer.py +++ b/tests/models/longformer/test_modeling_tf_longformer.py @@ -271,7 +271,6 @@ def prepare_config_and_inputs_for_question_answering(self): @require_tf class TFLongformerModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFLongformerModel, diff --git a/tests/models/longt5/test_modeling_flax_longt5.py b/tests/models/longt5/test_modeling_flax_longt5.py index 1ad9c1c5ce5a3b..2c262bef3092cd 100644 --- a/tests/models/longt5/test_modeling_flax_longt5.py +++ b/tests/models/longt5/test_modeling_flax_longt5.py @@ -45,6 +45,7 @@ import jax.numpy as jnp from flax.core.frozen_dict import unfreeze from flax.traverse_util import flatten_dict + from transformers import FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING, FLAX_MODEL_MAPPING, AutoTokenizer, LongT5Config from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model from transformers.models.longt5.modeling_flax_longt5 import ( @@ -82,7 +83,6 @@ def __init__( scope=None, decoder_layers=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -236,7 +236,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxLongT5ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): - all_model_classes = (FlaxLongT5Model, FlaxLongT5ForConditionalGeneration) if is_flax_available() else () all_generative_model_classes = (FlaxLongT5ForConditionalGeneration,) if is_flax_available() else () is_encoder_decoder = True diff --git 
a/tests/models/longt5/test_modeling_longt5.py b/tests/models/longt5/test_modeling_longt5.py index ffc67376f862ef..ee7ef89e80fe29 100644 --- a/tests/models/longt5/test_modeling_longt5.py +++ b/tests/models/longt5/test_modeling_longt5.py @@ -71,7 +71,6 @@ def __init__( decoder_layers=None, large_model_config_path="google/long-t5-local-large", ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -502,7 +501,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class LongT5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (LongT5Model, LongT5ForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = (LongT5ForConditionalGeneration,) if is_torch_available() else () fx_compatible = False @@ -919,7 +917,6 @@ def __init__( scope=None, large_model_config_path="google/long-t5-local-large", ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py index 789988d5ca353a..a1b0093cc4861e 100644 --- a/tests/models/luke/test_modeling_luke.py +++ b/tests/models/luke/test_modeling_luke.py @@ -586,7 +586,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class LukeModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( LukeModel, diff --git a/tests/models/lxmert/test_modeling_lxmert.py b/tests/models/lxmert/test_modeling_lxmert.py index 1c51d02e96b714..f57a339e5c344b 100644 --- a/tests/models/lxmert/test_modeling_lxmert.py +++ b/tests/models/lxmert/test_modeling_lxmert.py @@ -129,7 +129,6 @@ def __init__( self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers} def prepare_config_and_inputs(self): - output_attentions = self.output_attentions input_ids = ids_tensor([self.batch_size, self.seq_length], vocab_size=self.vocab_size) visual_feats = torch.rand(self.batch_size, self.num_visual_features, self.visual_feat_dim, device=torch_device) @@ -412,7 +411,6 @@ def resize_lxmert_num_qa_labels( ans, output_attentions, ): - start_labels = config.num_qa_labels num_large_labels = config.num_qa_labels * 2 num_small_labels = int(config.num_qa_labels * 2) @@ -532,7 +530,6 @@ def prepare_config_and_inputs_for_common(self, return_obj_labels=False): @require_torch class LxmertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (LxmertModel, LxmertForPreTraining, LxmertForQuestionAnswering) if is_torch_available() else () fx_compatible = True @@ -741,7 +738,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(attentions_vision.grad) def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict): - tf_inputs_dict = {} for key, value in pt_inputs_dict.items(): # skip key that does not exist in tf diff --git a/tests/models/lxmert/test_modeling_tf_lxmert.py b/tests/models/lxmert/test_modeling_tf_lxmert.py index 73eda47eb9508e..4fcc0867b0c6d2 100644 --- a/tests/models/lxmert/test_modeling_tf_lxmert.py +++ b/tests/models/lxmert/test_modeling_tf_lxmert.py @@ -364,7 +364,6 @@ def create_and_check_lxmert_for_pretraining( @require_tf class TFLxmertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFLxmertModel, TFLxmertForPreTraining) if is_tf_available() else () test_head_masking = False test_onnx = False @@ -493,7 +492,6 @@ def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): pt_inputs_dict = {} for key, value in 
tf_inputs_dict.items(): - if isinstance(value, dict): pt_inputs_dict[key] = self.prepare_pt_inputs_from_tf_inputs(value) elif isinstance(value, (list, tuple)): diff --git a/tests/models/lxmert/test_tokenization_lxmert.py b/tests/models/lxmert/test_tokenization_lxmert.py index 76047b1f44bccb..e094427f76135c 100644 --- a/tests/models/lxmert/test_tokenization_lxmert.py +++ b/tests/models/lxmert/test_tokenization_lxmert.py @@ -26,7 +26,6 @@ @require_tokenizers class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = LxmertTokenizer rust_tokenizer_class = LxmertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/m2m_100/test_tokenization_m2m_100.py b/tests/models/m2m_100/test_tokenization_m2m_100.py index f8c5f5b7badd40..626ef294125839 100644 --- a/tests/models/m2m_100/test_tokenization_m2m_100.py +++ b/tests/models/m2m_100/test_tokenization_m2m_100.py @@ -30,7 +30,7 @@ if is_sentencepiece_available(): - from transformers.models.m2m_100.tokenization_m2m_100 import save_json, VOCAB_FILES_NAMES + from transformers.models.m2m_100.tokenization_m2m_100 import VOCAB_FILES_NAMES, save_json from ...test_tokenization_common import TokenizerTesterMixin diff --git a/tests/models/marian/test_modeling_flax_marian.py b/tests/models/marian/test_modeling_flax_marian.py index 14d8dbac8f2de3..6510c0d732d318 100644 --- a/tests/models/marian/test_modeling_flax_marian.py +++ b/tests/models/marian/test_modeling_flax_marian.py @@ -35,6 +35,7 @@ import jax import jax.numpy as jnp + from transformers import MarianTokenizer from transformers.models.marian.modeling_flax_marian import FlaxMarianModel, FlaxMarianMTModel, shift_tokens_right diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index b1e4678e4ab138..3f8be1a0a75d07 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -18,6 +18,7 @@ import unittest from huggingface_hub.hf_api import list_models + from transformers import MarianConfig, is_torch_available from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from transformers.utils import cached_property diff --git a/tests/models/marian/test_tokenization_marian.py b/tests/models/marian/test_tokenization_marian.py index 6a079036bb6d5d..fae0edfa6896c3 100644 --- a/tests/models/marian/test_tokenization_marian.py +++ b/tests/models/marian/test_tokenization_marian.py @@ -45,7 +45,6 @@ @require_sentencepiece class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = MarianTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/mask2former/test_image_processing_mask2former.py b/tests/models/mask2former/test_image_processing_mask2former.py index 983b4fa7e6db0c..e21c8e67709446 100644 --- a/tests/models/mask2former/test_image_processing_mask2former.py +++ b/tests/models/mask2former/test_image_processing_mask2former.py @@ -18,8 +18,8 @@ import numpy as np from datasets import load_dataset - from huggingface_hub import hf_hub_download + from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available @@ -131,7 +131,6 @@ def get_fake_mask2former_outputs(self): @require_torch @require_vision class Mask2FormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = Mask2FormerImageProcessor if (is_vision_available() and 
is_torch_available()) else None def setUp(self): diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py index 3a8645f64a88f5..4d7409697e0236 100644 --- a/tests/models/mask2former/test_modeling_mask2former.py +++ b/tests/models/mask2former/test_modeling_mask2former.py @@ -173,7 +173,6 @@ def comm_check_on_output(result): @require_torch class Mask2FormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Mask2FormerModel, Mask2FormerForUniversalSegmentation) if is_torch_available() else () is_encoder_decoder = False diff --git a/tests/models/maskformer/test_image_processing_maskformer.py b/tests/models/maskformer/test_image_processing_maskformer.py index 2d7b710b59da7f..694029603bf7d8 100644 --- a/tests/models/maskformer/test_image_processing_maskformer.py +++ b/tests/models/maskformer/test_image_processing_maskformer.py @@ -18,8 +18,8 @@ import numpy as np from datasets import load_dataset - from huggingface_hub import hf_hub_download + from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available @@ -131,7 +131,6 @@ def get_fake_maskformer_outputs(self): @require_torch @require_vision class MaskFormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = MaskFormerImageProcessor if (is_vision_available() and is_torch_available()) else None def setUp(self): diff --git a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py index 52c811591bba80..a5f53d9f6fed10 100644 --- a/tests/models/maskformer/test_modeling_maskformer.py +++ b/tests/models/maskformer/test_modeling_maskformer.py @@ -173,7 +173,6 @@ def comm_check_on_output(result): @require_torch class MaskFormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (MaskFormerModel, MaskFormerForInstanceSegmentation) if is_torch_available() else () is_encoder_decoder = False diff --git a/tests/models/maskformer/test_modeling_maskformer_swin.py b/tests/models/maskformer/test_modeling_maskformer_swin.py index 3d4de9c8e69167..3b606188a1b820 100644 --- a/tests/models/maskformer/test_modeling_maskformer_swin.py +++ b/tests/models/maskformer/test_modeling_maskformer_swin.py @@ -163,7 +163,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MaskFormerSwinModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( MaskFormerSwinModel, diff --git a/tests/models/mbart/test_modeling_flax_mbart.py b/tests/models/mbart/test_modeling_flax_mbart.py index 1be81583575fb9..a642b2344c9e1e 100644 --- a/tests/models/mbart/test_modeling_flax_mbart.py +++ b/tests/models/mbart/test_modeling_flax_mbart.py @@ -35,6 +35,7 @@ import jax import jax.numpy as jnp + from transformers import AutoTokenizer from transformers.models.mbart.modeling_flax_mbart import ( FlaxMBartForConditionalGeneration, diff --git a/tests/models/mctct/test_feature_extraction_mctct.py b/tests/models/mctct/test_feature_extraction_mctct.py index e0c77ad450fde6..29b0cf899ad38a 100644 --- a/tests/models/mctct/test_feature_extraction_mctct.py +++ b/tests/models/mctct/test_feature_extraction_mctct.py @@ -104,7 +104,6 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = MCTCTFeatureExtractor if is_speech_available() else None def setUp(self): diff --git 
a/tests/models/megatron_bert/test_modeling_megatron_bert.py b/tests/models/megatron_bert/test_modeling_megatron_bert.py index 4ea3ddcb7be006..2a0c2f6ead8263 100644 --- a/tests/models/megatron_bert/test_modeling_megatron_bert.py +++ b/tests/models/megatron_bert/test_modeling_megatron_bert.py @@ -267,7 +267,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( MegatronBertModel, diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py index 04301962c3cdad..45b0942e2b72ed 100644 --- a/tests/models/mobilebert/test_modeling_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_mobilebert.py @@ -254,7 +254,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MobileBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( MobileBertModel, diff --git a/tests/models/mobilebert/test_modeling_tf_mobilebert.py b/tests/models/mobilebert/test_modeling_tf_mobilebert.py index 75334e2945091e..40586d20019abf 100644 --- a/tests/models/mobilebert/test_modeling_tf_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_tf_mobilebert.py @@ -42,7 +42,6 @@ @require_tf class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFMobileBertModel, diff --git a/tests/models/mobilebert/test_tokenization_mobilebert.py b/tests/models/mobilebert/test_tokenization_mobilebert.py index 395f4a2aab2cb9..3ecc2e3238d512 100644 --- a/tests/models/mobilebert/test_tokenization_mobilebert.py +++ b/tests/models/mobilebert/test_tokenization_mobilebert.py @@ -35,7 +35,6 @@ # Copied from transformers.tests.models.bert.test_modeling_bert.py with Bert->MobileBert and pathfix @require_tokenizers class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = MobileBertTokenizer rust_tokenizer_class = MobileBertTokenizerFast test_rust_tokenizer = True @@ -312,7 +311,6 @@ def test_change_tokenize_chinese_chars(self): text_with_chinese_char = "".join(list_of_commun_chinese_char) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - kwargs["tokenize_chinese_chars"] = True tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) diff --git a/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py b/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py index 34096ff1f960ab..51ca6f3b17c476 100644 --- a/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py +++ b/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py @@ -72,7 +72,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class MobileNetV1ImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = MobileNetV1ImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py b/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py index 472280753e9cc8..d5f148b21cd665 100644 --- a/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py +++ b/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py @@ -72,7 +72,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class 
MobileNetV2ImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = MobileNetV2ImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/mobilevit/test_image_processing_mobilevit.py b/tests/models/mobilevit/test_image_processing_mobilevit.py index d14a405715017a..fbc72a2d9e00ce 100644 --- a/tests/models/mobilevit/test_image_processing_mobilevit.py +++ b/tests/models/mobilevit/test_image_processing_mobilevit.py @@ -75,7 +75,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class MobileViTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = MobileViTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/mpnet/test_modeling_mpnet.py b/tests/models/mpnet/test_modeling_mpnet.py index 1e72870fdaddf1..1be05033806942 100644 --- a/tests/models/mpnet/test_modeling_mpnet.py +++ b/tests/models/mpnet/test_modeling_mpnet.py @@ -191,7 +191,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MPNetModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( MPNetForMaskedLM, diff --git a/tests/models/mpnet/test_modeling_tf_mpnet.py b/tests/models/mpnet/test_modeling_tf_mpnet.py index a0a4964d57e95a..3688e106cc664d 100644 --- a/tests/models/mpnet/test_modeling_tf_mpnet.py +++ b/tests/models/mpnet/test_modeling_tf_mpnet.py @@ -185,7 +185,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFMPNetModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFMPNetForMaskedLM, diff --git a/tests/models/mpnet/test_tokenization_mpnet.py b/tests/models/mpnet/test_tokenization_mpnet.py index f761b0280953de..e30dd3a9145e26 100644 --- a/tests/models/mpnet/test_tokenization_mpnet.py +++ b/tests/models/mpnet/test_tokenization_mpnet.py @@ -26,7 +26,6 @@ @require_tokenizers class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = MPNetTokenizer rust_tokenizer_class = MPNetTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/mt5/test_modeling_flax_mt5.py b/tests/models/mt5/test_modeling_flax_mt5.py index f9ef2d5e18478a..34a5731fd059f3 100644 --- a/tests/models/mt5/test_modeling_flax_mt5.py +++ b/tests/models/mt5/test_modeling_flax_mt5.py @@ -21,6 +21,7 @@ if is_flax_available(): import optax from flax.training.common_utils import onehot + from transformers import AutoTokenizer, FlaxMT5ForConditionalGeneration from transformers.models.t5.modeling_flax_t5 import shift_tokens_right diff --git a/tests/models/mvp/test_tokenization_mvp.py b/tests/models/mvp/test_tokenization_mvp.py index 71e83fba0e16ff..8bddb8443b642f 100644 --- a/tests/models/mvp/test_tokenization_mvp.py +++ b/tests/models/mvp/test_tokenization_mvp.py @@ -132,7 +132,6 @@ def test_prepare_batch_not_longer_than_maxlen(self): @require_torch def test_special_tokens(self): - src_text = ["A long paragraph for summarization."] tgt_text = [ "Summary of the text.", diff --git a/tests/models/nat/test_modeling_nat.py b/tests/models/nat/test_modeling_nat.py index b89d4c1bb75a8b..1c74fe84b044e7 100644 --- a/tests/models/nat/test_modeling_nat.py +++ b/tests/models/nat/test_modeling_nat.py @@ -190,7 +190,6 @@ def prepare_config_and_inputs_for_common(self): @require_natten @require_torch class NatModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( NatModel, diff --git a/tests/models/nezha/test_modeling_nezha.py b/tests/models/nezha/test_modeling_nezha.py 
index 6c91d8e7fb18a5..31eb9e1e9b11b9 100644 --- a/tests/models/nezha/test_modeling_nezha.py +++ b/tests/models/nezha/test_modeling_nezha.py @@ -316,7 +316,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class NezhaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( NezhaModel, @@ -429,7 +428,6 @@ def test_model_from_pretrained(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # NezhaForMultipleChoice behaves incorrectly in JIT environments. if model_class == NezhaForMultipleChoice: return diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py index d77b101fa76608..01d0420180c8ac 100644 --- a/tests/models/nllb/test_tokenization_nllb.py +++ b/tests/models/nllb/test_tokenization_nllb.py @@ -258,7 +258,6 @@ def test_save_slow_from_fast_and_reload_fast(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( diff --git a/tests/models/nystromformer/test_modeling_nystromformer.py b/tests/models/nystromformer/test_modeling_nystromformer.py index b93c074bf68377..43879571064996 100644 --- a/tests/models/nystromformer/test_modeling_nystromformer.py +++ b/tests/models/nystromformer/test_modeling_nystromformer.py @@ -217,7 +217,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class NystromformerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( NystromformerModel, diff --git a/tests/models/oneformer/test_image_processing_oneformer.py b/tests/models/oneformer/test_image_processing_oneformer.py index ac12447a36341e..8dec5c407083b9 100644 --- a/tests/models/oneformer/test_image_processing_oneformer.py +++ b/tests/models/oneformer/test_image_processing_oneformer.py @@ -18,8 +18,8 @@ import unittest import numpy as np - from huggingface_hub import hf_hub_download + from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available diff --git a/tests/models/oneformer/test_processor_oneformer.py b/tests/models/oneformer/test_processor_oneformer.py index 5056d682832d95..5ce677cba6c6f5 100644 --- a/tests/models/oneformer/test_processor_oneformer.py +++ b/tests/models/oneformer/test_processor_oneformer.py @@ -21,8 +21,8 @@ import numpy as np from datasets import load_dataset - from huggingface_hub import hf_hub_download + from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available diff --git a/tests/models/openai/test_modeling_openai.py b/tests/models/openai/test_modeling_openai.py index 6c91808421f4b8..e525c297410797 100644 --- a/tests/models/openai/test_modeling_openai.py +++ b/tests/models/openai/test_modeling_openai.py @@ -190,7 +190,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class OpenAIGPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, OpenAIGPTForSequenceClassification) if is_torch_available() diff --git a/tests/models/openai/test_modeling_tf_openai.py 
b/tests/models/openai/test_modeling_tf_openai.py index 7cdc2a8bb1879b..3ce67e058a469d 100644 --- a/tests/models/openai/test_modeling_tf_openai.py +++ b/tests/models/openai/test_modeling_tf_openai.py @@ -192,7 +192,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel, TFOpenAIGPTForSequenceClassification) if is_tf_available() diff --git a/tests/models/opt/test_modeling_flax_opt.py b/tests/models/opt/test_modeling_flax_opt.py index 04d1f75db1c74a..ef94633f22a87e 100644 --- a/tests/models/opt/test_modeling_flax_opt.py +++ b/tests/models/opt/test_modeling_flax_opt.py @@ -33,6 +33,7 @@ import jax import jax.numpy as jnp + from transformers import FlaxOPTForCausalLM, FlaxOPTModel, GPT2Tokenizer diff --git a/tests/models/owlvit/test_image_processing_owlvit.py b/tests/models/owlvit/test_image_processing_owlvit.py index b94120f563da1a..5a0afa382652eb 100644 --- a/tests/models/owlvit/test_image_processing_owlvit.py +++ b/tests/models/owlvit/test_image_processing_owlvit.py @@ -82,7 +82,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class OwlViTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = OwlViTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index 957533980151ad..bd83d7e682961b 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + from transformers import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -293,7 +293,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class OwlViTTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (OwlViTTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -339,7 +338,6 @@ def test_model_from_pretrained(self): class OwlViTModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: diff --git a/tests/models/pegasus/test_modeling_flax_pegasus.py b/tests/models/pegasus/test_modeling_flax_pegasus.py index 61c356bfb0ced3..fbc49c78112bf9 100644 --- a/tests/models/pegasus/test_modeling_flax_pegasus.py +++ b/tests/models/pegasus/test_modeling_flax_pegasus.py @@ -30,10 +30,10 @@ # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform" - import numpy as np - import jax import jax.numpy as jnp + import numpy as np + from transformers import FlaxPegasusForConditionalGeneration, FlaxPegasusModel diff --git a/tests/models/pegasus/test_tokenization_pegasus.py b/tests/models/pegasus/test_tokenization_pegasus.py index de2886a5e12081..8f554a411e7d12 100644 --- a/tests/models/pegasus/test_tokenization_pegasus.py +++ b/tests/models/pegasus/test_tokenization_pegasus.py @@ -27,7 +27,6 @@ @require_sentencepiece @require_tokenizers class 
PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = PegasusTokenizer rust_tokenizer_class = PegasusTokenizerFast test_rust_tokenizer = True @@ -134,7 +133,6 @@ def test_tokenizer_integration(self): @require_sentencepiece @require_tokenizers class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = PegasusTokenizer rust_tokenizer_class = PegasusTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py index 5f69b9ff69eec0..f07b8746768ee1 100644 --- a/tests/models/perceiver/test_modeling_perceiver.py +++ b/tests/models/perceiver/test_modeling_perceiver.py @@ -263,7 +263,6 @@ def prepare_config_and_inputs_for_model_class(self, model_class): @require_torch class PerceiverModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( PerceiverModel, @@ -739,7 +738,6 @@ def test_problem_types(self): for problem_type in problem_types: with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] config.num_labels = problem_type["num_labels"] @@ -849,7 +847,6 @@ def extract_image_patches(x, kernel, stride=1, dilation=1): class PerceiverModelIntegrationTest(unittest.TestCase): @slow def test_inference_masked_lm(self): - tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver") model = PerceiverForMaskedLM.from_pretrained("deepmind/language-perceiver") model.to(torch_device) @@ -884,7 +881,6 @@ def test_inference_masked_lm(self): @slow def test_inference_image_classification(self): - feature_extractor = PerceiverFeatureExtractor() model = PerceiverForImageClassificationLearned.from_pretrained("deepmind/vision-perceiver-learned") model.to(torch_device) @@ -909,7 +905,6 @@ def test_inference_image_classification(self): @slow def test_inference_image_classification_fourier(self): - feature_extractor = PerceiverFeatureExtractor() model = PerceiverForImageClassificationFourier.from_pretrained("deepmind/vision-perceiver-fourier") model.to(torch_device) @@ -934,7 +929,6 @@ def test_inference_image_classification_fourier(self): @slow def test_inference_image_classification_conv(self): - feature_extractor = PerceiverFeatureExtractor() model = PerceiverForImageClassificationConvProcessing.from_pretrained("deepmind/vision-perceiver-conv") model.to(torch_device) diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py index 3c7a67bcd2b916..197ab6d5bfa209 100644 --- a/tests/models/perceiver/test_tokenization_perceiver.py +++ b/tests/models/perceiver/test_tokenization_perceiver.py @@ -36,7 +36,6 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = PerceiverTokenizer test_rust_tokenizer = False diff --git a/tests/models/phobert/test_tokenization_phobert.py b/tests/models/phobert/test_tokenization_phobert.py index de16c154c92524..6624957531b07c 100644 --- a/tests/models/phobert/test_tokenization_phobert.py +++ b/tests/models/phobert/test_tokenization_phobert.py @@ -22,7 +22,6 @@ class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = PhobertTokenizer test_rust_tokenizer = False diff --git a/tests/models/poolformer/test_image_processing_poolformer.py b/tests/models/poolformer/test_image_processing_poolformer.py index d5596a55a9f258..b6078fb5c5a424 100644 --- 
a/tests/models/poolformer/test_image_processing_poolformer.py +++ b/tests/models/poolformer/test_image_processing_poolformer.py @@ -78,7 +78,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class PoolFormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = PoolFormerImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/poolformer/test_modeling_poolformer.py b/tests/models/poolformer/test_modeling_poolformer.py index 9bb8fa2e29cd34..a7869cbf7477a2 100644 --- a/tests/models/poolformer/test_modeling_poolformer.py +++ b/tests/models/poolformer/test_modeling_poolformer.py @@ -122,7 +122,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PoolFormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (PoolFormerModel, PoolFormerForImageClassification) if is_torch_available() else () test_head_masking = False diff --git a/tests/models/prophetnet/test_modeling_prophetnet.py b/tests/models/prophetnet/test_modeling_prophetnet.py index 9258d797884b0b..86cc8f3896e376 100644 --- a/tests/models/prophetnet/test_modeling_prophetnet.py +++ b/tests/models/prophetnet/test_modeling_prophetnet.py @@ -70,7 +70,6 @@ def __init__( disable_ngram_loss=False, scope=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length diff --git a/tests/models/prophetnet/test_tokenization_prophetnet.py b/tests/models/prophetnet/test_tokenization_prophetnet.py index 8d95eb310025d1..cf4317b3a669b6 100644 --- a/tests/models/prophetnet/test_tokenization_prophetnet.py +++ b/tests/models/prophetnet/test_tokenization_prophetnet.py @@ -32,7 +32,6 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = ProphetNetTokenizer test_rust_tokenizer = False diff --git a/tests/models/qdqbert/test_modeling_qdqbert.py b/tests/models/qdqbert/test_modeling_qdqbert.py index 82bf5e3e336457..66e864a6be8d02 100644 --- a/tests/models/qdqbert/test_modeling_qdqbert.py +++ b/tests/models/qdqbert/test_modeling_qdqbert.py @@ -420,7 +420,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch @require_pytorch_quantization class QDQBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( QDQBertModel, diff --git a/tests/models/rag/test_modeling_rag.py b/tests/models/rag/test_modeling_rag.py index 80819663a107d1..48c7099620f308 100644 --- a/tests/models/rag/test_modeling_rag.py +++ b/tests/models/rag/test_modeling_rag.py @@ -48,10 +48,10 @@ T5_SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") if is_torch_available() and is_datasets_available() and is_faiss_available(): + import faiss import torch from datasets import Dataset - import faiss from transformers import ( AutoConfig, AutoModel, @@ -99,7 +99,6 @@ def require_retrieval(test_case): @require_retrieval @require_sentencepiece class RagTestMixin: - all_model_classes = ( (RagModel, RagTokenForGeneration, RagSequenceForGeneration) if is_torch_available() and is_datasets_available() and is_faiss_available() @@ -493,7 +492,7 @@ def check_model_with_mismatch_n_docs_value( decoder_attention_mask, retriever_n_docs, generator_n_docs, - **kwargs + **kwargs, ): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) diff --git a/tests/models/rag/test_modeling_tf_rag.py b/tests/models/rag/test_modeling_tf_rag.py index 314ce099baf65f..541d7cd132714e 100644 --- a/tests/models/rag/test_modeling_tf_rag.py +++ 
b/tests/models/rag/test_modeling_tf_rag.py @@ -16,9 +16,9 @@ if is_tf_available() and is_datasets_available() and is_faiss_available(): + import faiss import tensorflow as tf from datasets import Dataset - import faiss from transformers import ( AutoConfig, @@ -31,7 +31,6 @@ TFRagSequenceForGeneration, TFRagTokenForGeneration, ) - from transformers.modeling_tf_outputs import TFBaseModelOutput from ..bart.test_modeling_tf_bart import TFBartModelTester @@ -58,7 +57,6 @@ def require_retrieval(test_case): @require_retrieval @require_sentencepiece class TFRagTestMixin: - all_model_classes = ( (TFRagModel, TFRagTokenForGeneration, TFRagSequenceForGeneration) if is_tf_available() and is_datasets_available() and is_faiss_available() @@ -392,7 +390,7 @@ def check_model_with_mismatch_n_docs_value( decoder_attention_mask, retriever_n_docs, generator_n_docs, - **kwargs + **kwargs, ): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) diff --git a/tests/models/rag/test_retrieval_rag.py b/tests/models/rag/test_retrieval_rag.py index c6c1e11360f8b5..d4c119815c96f4 100644 --- a/tests/models/rag/test_retrieval_rag.py +++ b/tests/models/rag/test_retrieval_rag.py @@ -360,7 +360,6 @@ def test_hf_index_retriever_call(self): @require_tokenizers @require_sentencepiece def test_custom_hf_index_end2end_retriever_call(self): - context_encoder_tokenizer = self.get_dpr_ctx_encoder_tokenizer() n_docs = 1 retriever = self.get_dummy_custom_hf_index_retriever(from_disk=False) diff --git a/tests/models/rag/test_tokenization_rag.py b/tests/models/rag/test_tokenization_rag.py index ae9909248471be..3ac5b0efe02ea6 100644 --- a/tests/models/rag/test_tokenization_rag.py +++ b/tests/models/rag/test_tokenization_rag.py @@ -110,7 +110,6 @@ def tearDown(self): @require_tokenizers def test_save_load_pretrained_with_saved_config(self): - save_dir = os.path.join(self.tmpdirname, "rag_tokenizer") rag_config = RagConfig(question_encoder=DPRConfig().to_dict(), generator=BartConfig().to_dict()) rag_tokenizer = RagTokenizer(question_encoder=self.get_dpr_tokenizer(), generator=self.get_bart_tokenizer()) diff --git a/tests/models/realm/test_modeling_realm.py b/tests/models/realm/test_modeling_realm.py index c087a4dd0d3844..3fa993285dcecc 100644 --- a/tests/models/realm/test_modeling_realm.py +++ b/tests/models/realm/test_modeling_realm.py @@ -304,7 +304,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class RealmModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( RealmEmbedder, diff --git a/tests/models/realm/test_tokenization_realm.py b/tests/models/realm/test_tokenization_realm.py index 2a065ceee66af6..6a5a3878fd4354 100644 --- a/tests/models/realm/test_tokenization_realm.py +++ b/tests/models/realm/test_tokenization_realm.py @@ -33,7 +33,6 @@ @require_tokenizers class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = RealmTokenizer rust_tokenizer_class = RealmTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/reformer/test_tokenization_reformer.py b/tests/models/reformer/test_tokenization_reformer.py index 37ea66847f2d0c..a2a0db6c370580 100644 --- a/tests/models/reformer/test_tokenization_reformer.py +++ b/tests/models/reformer/test_tokenization_reformer.py @@ -27,7 +27,6 @@ @require_sentencepiece @require_tokenizers class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = ReformerTokenizer rust_tokenizer_class = ReformerTokenizerFast test_rust_tokenizer = True diff --git 
a/tests/models/rembert/test_modeling_rembert.py b/tests/models/rembert/test_modeling_rembert.py index a3ffd6dfd5a164..b431aff86e0983 100644 --- a/tests/models/rembert/test_modeling_rembert.py +++ b/tests/models/rembert/test_modeling_rembert.py @@ -361,7 +361,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class RemBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( RemBertModel, diff --git a/tests/models/rembert/test_modeling_tf_rembert.py b/tests/models/rembert/test_modeling_tf_rembert.py index 6d4cf0a523b933..11a7b54497b98b 100644 --- a/tests/models/rembert/test_modeling_tf_rembert.py +++ b/tests/models/rembert/test_modeling_tf_rembert.py @@ -571,7 +571,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFRemBertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFRemBertModel, diff --git a/tests/models/retribert/test_tokenization_retribert.py b/tests/models/retribert/test_tokenization_retribert.py index e2bf4e61b1ac09..25b3df6f3e34ea 100644 --- a/tests/models/retribert/test_tokenization_retribert.py +++ b/tests/models/retribert/test_tokenization_retribert.py @@ -35,7 +35,6 @@ # Copied from transformers.tests.bert.test_modeling_bert.py with Bert->RetriBert @require_tokenizers class RetriBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = RetriBertTokenizer test_slow_tokenizer = True rust_tokenizer_class = RetriBertTokenizerFast @@ -307,7 +306,6 @@ def test_change_tokenize_chinese_chars(self): text_with_chinese_char = "".join(list_of_commun_chinese_char) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - kwargs["tokenize_chinese_chars"] = True tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) @@ -352,7 +350,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return diff --git a/tests/models/roberta/test_modeling_flax_roberta.py b/tests/models/roberta/test_modeling_flax_roberta.py index 5bd8a56022ce8c..c325e295f610d1 100644 --- a/tests/models/roberta/test_modeling_flax_roberta.py +++ b/tests/models/roberta/test_modeling_flax_roberta.py @@ -132,7 +132,6 @@ def prepare_config_and_inputs_for_decoder(self): @require_flax class FlaxRobertaModelTest(FlaxModelTesterMixin, unittest.TestCase): - test_head_masking = True all_model_classes = ( diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 5128789d41a5d7..f9df989d9bb211 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -367,7 +367,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( RobertaForCausalLM, diff --git a/tests/models/roberta/test_modeling_tf_roberta.py b/tests/models/roberta/test_modeling_tf_roberta.py index f9408b84171d3c..28166171e11678 100644 --- a/tests/models/roberta/test_modeling_tf_roberta.py +++ b/tests/models/roberta/test_modeling_tf_roberta.py @@ -548,7 +548,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFRobertaModelTest(TFModelTesterMixin, 
unittest.TestCase): - all_model_classes = ( ( TFRobertaModel, diff --git a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py index 357c05fb02a13f..3f15ca9ff3af1b 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py @@ -24,6 +24,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.models.roberta_prelayernorm.modeling_flax_roberta_prelayernorm import ( FlaxRobertaPreLayerNormForCausalLM, FlaxRobertaPreLayerNormForMaskedLM, @@ -135,7 +136,6 @@ def prepare_config_and_inputs_for_decoder(self): @require_flax # Copied from tests.models.roberta.test_modelling_flax_roberta.FlaxRobertaPreLayerNormModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta-base->andreasmadsen/efficient_mlm_m0.40 class FlaxRobertaPreLayerNormModelTest(FlaxModelTesterMixin, unittest.TestCase): - test_head_masking = True all_model_classes = ( diff --git a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py index 971f87c4ee634b..d32a5e959d7b9e 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py @@ -366,7 +366,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch # Copied from tests.models.roberta.test_modelling_roberta.RobertaPreLayerNormModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm class RobertaPreLayerNormModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( RobertaPreLayerNormForCausalLM, diff --git a/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py index a7263218709e73..1580f27e6f0863 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py @@ -550,7 +550,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf # Copied from tests.models.roberta.test_modelling_tf_roberta.TFRobertaPreLayerNormModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm class TFRobertaPreLayerNormModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFRobertaPreLayerNormModel, diff --git a/tests/models/roformer/test_modeling_flax_roformer.py b/tests/models/roformer/test_modeling_flax_roformer.py index d45c08efdbb394..28d0ffba95a407 100644 --- a/tests/models/roformer/test_modeling_flax_roformer.py +++ b/tests/models/roformer/test_modeling_flax_roformer.py @@ -24,6 +24,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.models.roformer.modeling_flax_roformer import ( FlaxRoFormerForMaskedLM, FlaxRoFormerForMultipleChoice, @@ -116,7 +117,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxRoFormerModelTest(FlaxModelTesterMixin, unittest.TestCase): - test_head_masking = True all_model_classes = ( diff --git a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py index dadb0d8e747b6b..acd6ec885bd55e 100644 --- a/tests/models/roformer/test_modeling_roformer.py +++ b/tests/models/roformer/test_modeling_roformer.py @@ -361,7 +361,6 @@ def 
prepare_config_and_inputs_for_common(self): @require_torch class RoFormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( RoFormerModel, @@ -491,7 +490,6 @@ def test_basic(self): ) def test_positional_emb_weights_against_roformer(self): - desired_weights = torch.tensor( [ [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], diff --git a/tests/models/roformer/test_modeling_tf_roformer.py b/tests/models/roformer/test_modeling_tf_roformer.py index d32d30ae8ad92c..a0fba3a6806f81 100644 --- a/tests/models/roformer/test_modeling_tf_roformer.py +++ b/tests/models/roformer/test_modeling_tf_roformer.py @@ -240,7 +240,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFRoFormerModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFRoFormerModel, @@ -344,7 +343,6 @@ def test_basic(self): tf.debugging.assert_near(emb, desired_weights, atol=self.tolerance) def test_positional_emb_weights_against_roformer(self): - desired_weights = tf.constant( [ [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], diff --git a/tests/models/roformer/test_tokenization_roformer.py b/tests/models/roformer/test_tokenization_roformer.py index 7546bc2e41ddd9..2d674100f02bfc 100644 --- a/tests/models/roformer/test_tokenization_roformer.py +++ b/tests/models/roformer/test_tokenization_roformer.py @@ -24,7 +24,6 @@ @require_rjieba @require_tokenizers class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = RoFormerTokenizer rust_tokenizer_class = RoFormerTokenizerFast space_between_special_tokens = True diff --git a/tests/models/segformer/test_image_processing_segformer.py b/tests/models/segformer/test_image_processing_segformer.py index 2be0228d65a5f6..5559fae83b082d 100644 --- a/tests/models/segformer/test_image_processing_segformer.py +++ b/tests/models/segformer/test_image_processing_segformer.py @@ -96,7 +96,6 @@ def prepare_semantic_batch_inputs(): @require_torch @require_vision class SegformerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = SegformerImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py index 6037170fb1f345..57513800d6ffee 100644 --- a/tests/models/segformer/test_modeling_segformer.py +++ b/tests/models/segformer/test_modeling_segformer.py @@ -160,7 +160,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class SegformerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( SegformerModel, diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py index dfdb24f37b17ee..bfcc580bb4cc2d 100644 --- a/tests/models/segformer/test_modeling_tf_segformer.py +++ b/tests/models/segformer/test_modeling_tf_segformer.py @@ -288,7 +288,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): diff --git a/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py index c42da75bf635b7..f2c75e702bf765 100644 --- a/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py +++ 
b/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py @@ -33,6 +33,7 @@ import jax.numpy as jnp from flax.training.common_utils import onehot from flax.traverse_util import flatten_dict + from transformers import ( FlaxBartForCausalLM, FlaxBertForCausalLM, @@ -73,7 +74,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -103,7 +104,7 @@ def check_encoder_decoder_model( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -142,7 +143,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -169,7 +170,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model} @@ -208,7 +209,7 @@ def check_encoder_decoder_model_from_encoder_decoder_pretrained( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) # assert that loading encoder and decoder models from configs has been correctly executed @@ -253,7 +254,7 @@ def check_encoder_decoder_model_output_attentions( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -336,7 +337,7 @@ def check_freeze_feature_encoder( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) enc_dec_model = FlaxSpeechEncoderDecoderModel(encoder_decoder_config) @@ -406,7 +407,6 @@ def compute_loss( self.assertTrue((grad == grad_frozen).all()) def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): - pt_model.to(torch_device) pt_model.eval() @@ -448,7 +448,6 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 1e-5) def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): - encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = SpeechEncoderDecoderModel(encoder_decoder_config) @@ -460,7 +459,6 @@ def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict) def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict): - encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = SpeechEncoderDecoderModel(encoder_decoder_config) @@ -508,7 +506,6 @@ def 
assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): @is_pt_flax_cross_test def test_pt_flax_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() config = config_inputs_dict.pop("config") decoder_config = config_inputs_dict.pop("decoder_config") diff --git a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py index 3ecca17324a34e..368232331a2ac0 100644 --- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py +++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py @@ -62,7 +62,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_attention_mask, input_values=None, input_features=None, - **kwargs + **kwargs, ): encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -95,7 +95,7 @@ def check_encoder_decoder_model( decoder_attention_mask, input_values=None, input_features=None, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -135,7 +135,7 @@ def check_encoder_decoder_model_with_inputs( decoder_attention_mask, input_values=None, input_features=None, - **kwargs + **kwargs, ): inputs = input_values if input_features is None else input_features encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) @@ -173,7 +173,7 @@ def check_encoder_decoder_model_from_pretrained( return_dict, input_values=None, input_features=None, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -202,7 +202,7 @@ def check_save_and_load( decoder_attention_mask, input_values=None, input_features=None, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -245,7 +245,7 @@ def check_save_and_load_encoder_decoder_model( decoder_attention_mask, input_values=None, input_features=None, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -292,7 +292,7 @@ def check_encoder_decoder_model_output_attentions( labels=None, input_values=None, input_features=None, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index 244b748c7139ff..749323d3a8c9c6 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -104,7 +104,6 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = Speech2TextFeatureExtractor if is_speech_available() else None def setUp(self): diff --git 
a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py index 2720f531af9179..34cf071bd11aa6 100644 --- a/tests/models/speecht5/test_feature_extraction_speecht5.py +++ b/tests/models/speecht5/test_feature_extraction_speecht5.py @@ -149,7 +149,6 @@ def prepare_inputs_for_target(self, equal_length=False, numpify=False): @require_torch @require_torchaudio class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = SpeechT5FeatureExtractor if is_speech_available() else None def setUp(self): diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py index f064611b6a9e98..14dce7e9757854 100644 --- a/tests/models/splinter/test_modeling_splinter.py +++ b/tests/models/splinter/test_modeling_splinter.py @@ -208,7 +208,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class SplinterModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( SplinterModel, @@ -338,7 +337,6 @@ def test_multi_gpu_data_parallel_forward(self): inputs_dict[k] = v.to(0) for model_class in self.all_model_classes: - # Skip this case since it will fail sometimes, as described above. if model_class == SplinterForPreTraining: continue diff --git a/tests/models/squeezebert/test_modeling_squeezebert.py b/tests/models/squeezebert/test_modeling_squeezebert.py index cffc4570a05918..8b8187cbc3f3a6 100644 --- a/tests/models/squeezebert/test_modeling_squeezebert.py +++ b/tests/models/squeezebert/test_modeling_squeezebert.py @@ -215,7 +215,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class SqueezeBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( SqueezeBertModel, diff --git a/tests/models/squeezebert/test_tokenization_squeezebert.py b/tests/models/squeezebert/test_tokenization_squeezebert.py index 88d715bcc1409f..a65862556405e8 100644 --- a/tests/models/squeezebert/test_tokenization_squeezebert.py +++ b/tests/models/squeezebert/test_tokenization_squeezebert.py @@ -22,7 +22,6 @@ @require_tokenizers class SqueezeBertTokenizationTest(BertTokenizationTest): - tokenizer_class = SqueezeBertTokenizer rust_tokenizer_class = SqueezeBertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py index 0b780d74b5b3f2..a56ba40978e2c0 100644 --- a/tests/models/swin/test_modeling_swin.py +++ b/tests/models/swin/test_modeling_swin.py @@ -219,7 +219,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class SwinModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( SwinModel, diff --git a/tests/models/swin/test_modeling_tf_swin.py b/tests/models/swin/test_modeling_tf_swin.py index be5861ce48b4a8..bc2e957c433f7d 100644 --- a/tests/models/swin/test_modeling_tf_swin.py +++ b/tests/models/swin/test_modeling_tf_swin.py @@ -177,7 +177,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFSwinModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFSwinModel, @@ -335,7 +334,6 @@ def check_hidden_states_output(self, inputs_dict, config, model_class, image_siz ) def test_hidden_states_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() image_size = to_2tuple(self.model_tester.image_size) diff --git a/tests/models/swin2sr/test_image_processing_swin2sr.py b/tests/models/swin2sr/test_image_processing_swin2sr.py index 
06e9539693e1bc..1cb19387ffb974 100644 --- a/tests/models/swin2sr/test_image_processing_swin2sr.py +++ b/tests/models/swin2sr/test_image_processing_swin2sr.py @@ -101,7 +101,6 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torch @require_vision class Swin2SRImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = Swin2SRImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py index 5bd54d7a79f9b9..556decee8f4796 100644 --- a/tests/models/swin2sr/test_modeling_swin2sr.py +++ b/tests/models/swin2sr/test_modeling_swin2sr.py @@ -156,7 +156,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Swin2SRModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Swin2SRModel, Swin2SRForImageSuperResolution) if is_torch_available() else () fx_compatible = False diff --git a/tests/models/swinv2/test_modeling_swinv2.py b/tests/models/swinv2/test_modeling_swinv2.py index 13a39b139c81aa..b0c689cfde3ddd 100644 --- a/tests/models/swinv2/test_modeling_swinv2.py +++ b/tests/models/swinv2/test_modeling_swinv2.py @@ -171,7 +171,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Swinv2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( (Swinv2Model, Swinv2ForImageClassification, Swinv2ForMaskedImageModeling) if is_torch_available() else () ) diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py index 20e089a3133f4e..283fd1710168df 100644 --- a/tests/models/switch_transformers/test_modeling_switch_transformers.py +++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py @@ -73,7 +73,6 @@ def __init__( expert_capacity=100, router_jitter_noise=0.0, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -548,7 +547,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class SwitchTransformersModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( (SwitchTransformersModel, SwitchTransformersForConditionalGeneration) if is_torch_available() else () ) @@ -828,7 +826,6 @@ def __init__( pad_token_id=0, scope=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length diff --git a/tests/models/t5/test_modeling_flax_t5.py b/tests/models/t5/test_modeling_flax_t5.py index 10e6622bb7df14..a2a80ab25bb662 100644 --- a/tests/models/t5/test_modeling_flax_t5.py +++ b/tests/models/t5/test_modeling_flax_t5.py @@ -46,6 +46,7 @@ from flax.core.frozen_dict import unfreeze from flax.training.common_utils import onehot from flax.traverse_util import flatten_dict + from transformers import FLAX_MODEL_MAPPING, ByT5Tokenizer, T5Config, T5Tokenizer from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model from transformers.models.t5.modeling_flax_t5 import ( @@ -81,7 +82,6 @@ def __init__( scope=None, decoder_layers=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -228,7 +228,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxT5ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): - all_model_classes = (FlaxT5Model, FlaxT5ForConditionalGeneration) if 
is_flax_available() else () all_generative_model_classes = (FlaxT5ForConditionalGeneration,) if is_flax_available() else () is_encoder_decoder = True @@ -489,7 +488,6 @@ def __init__( decoder_start_token_id=0, scope=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -576,7 +574,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxT5EncoderOnlyModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = (FlaxT5EncoderModel,) if is_flax_available() else () is_encoder_decoder = False diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index fe3ce7597bfee5..c6c0ede07121b8 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -73,7 +73,6 @@ def __init__( scope=None, decoder_layers=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -520,7 +519,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = (T5ForConditionalGeneration,) if is_torch_available() else () all_parallelizable_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else () @@ -703,7 +701,6 @@ def __init__( pad_token_id=0, scope=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length diff --git a/tests/models/t5/test_modeling_tf_t5.py b/tests/models/t5/test_modeling_tf_t5.py index 57c991f9f15afc..767f9def7b90bd 100644 --- a/tests/models/t5/test_modeling_tf_t5.py +++ b/tests/models/t5/test_modeling_tf_t5.py @@ -240,7 +240,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase): - is_encoder_decoder = True all_model_classes = (TFT5Model, TFT5ForConditionalGeneration) if is_tf_available() else () all_generative_model_classes = (TFT5ForConditionalGeneration,) if is_tf_available() else () @@ -346,7 +345,6 @@ def __init__( pad_token_id=0, scope=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index eb429f750ab9cf..8dbef672972ea6 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -38,7 +38,6 @@ @require_sentencepiece @require_tokenizers class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = T5Tokenizer rust_tokenizer_class = T5TokenizerFast test_rust_tokenizer = True @@ -272,7 +271,6 @@ def test_fast_and_slow_same_result(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special_token>", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( @@ -306,7 +304,6 @@ def test_special_tokens_initialization_with_non_empty_additional_special_tokens( tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) for tokenizer_class, tokenizer_utils in tokenizer_list: - with tempfile.TemporaryDirectory() as tmp_dir: tokenizer_utils.save_pretrained(tmp_dir) diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py 
b/tests/models/table_transformer/test_modeling_table_transformer.py index 34bcacefe9ab97..06fbe6f8e2b2b5 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -20,6 +20,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import TableTransformerConfig, is_timm_available, is_vision_available from transformers.testing_utils import require_timm, require_vision, slow, torch_device diff --git a/tests/models/tapas/test_modeling_tapas.py b/tests/models/tapas/test_modeling_tapas.py index 504f3e278ea888..1448c9e33686c3 100644 --- a/tests/models/tapas/test_modeling_tapas.py +++ b/tests/models/tapas/test_modeling_tapas.py @@ -409,7 +409,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class TapasModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TapasModel, diff --git a/tests/models/tapas/test_modeling_tf_tapas.py b/tests/models/tapas/test_modeling_tf_tapas.py index 2f49b57445baaf..ebb4d0d8bf0155 100644 --- a/tests/models/tapas/test_modeling_tf_tapas.py +++ b/tests/models/tapas/test_modeling_tf_tapas.py @@ -419,7 +419,6 @@ def prepare_config_and_inputs_for_common(self): @require_tensorflow_probability @require_tf class TFTapasModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFTapasModel, diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index 89865a78e73394..d3af3276f751b9 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -90,7 +90,6 @@ def get_clean_sequence( add_special_tokens: bool = True, return_table_and_query: bool = False, ): - toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] if empty_table: @@ -635,7 +634,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - table, query = self.get_table_and_query(tokenizer) sequences = tokenizer.encode(table, query, add_special_tokens=False) @@ -1040,7 +1038,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return diff --git a/tests/models/tapex/test_tokenization_tapex.py b/tests/models/tapex/test_tokenization_tapex.py index dec0f507ed3c04..9bc61acb7f0d2e 100644 --- a/tests/models/tapex/test_tokenization_tapex.py +++ b/tests/models/tapex/test_tokenization_tapex.py @@ -84,7 +84,6 @@ def get_clean_sequence( add_special_tokens: bool = True, return_table_and_query: bool = False, ): - toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] if empty_table: @@ -364,7 +363,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - table, query = self.get_table_and_query(tokenizer) sequences = tokenizer.encode(table, query, add_special_tokens=False) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index a3973a39edd887..1f64d381b1d794 100644 --- 
a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -19,6 +19,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import is_torch_available from transformers.testing_utils import is_flaky, require_torch, slow, torch_device diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py index 8f95b1d0189d86..5a9e039e7fd6a5 100644 --- a/tests/models/timesformer/test_modeling_timesformer.py +++ b/tests/models/timesformer/test_modeling_timesformer.py @@ -20,8 +20,8 @@ import unittest import numpy as np - from huggingface_hub import hf_hub_download + from transformers import TimesformerConfig from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_vision, slow, torch_device diff --git a/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py b/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py index 5bdb52450dbfaa..07c898bf6fda01 100644 --- a/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py +++ b/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py @@ -95,7 +95,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class TrajectoryTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (TrajectoryTransformerModel,) if is_torch_available() else () # Ignoring of a failing test from GenerationTesterMixin, as the model does not use inputs_ids diff --git a/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py b/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py index 84e25d8716f5f8..c0051eab2dd855 100644 --- a/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py +++ b/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py @@ -156,7 +156,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( (TFTransfoXLModel, TFTransfoXLLMHeadModel, TFTransfoXLForSequenceClassification) if is_tf_available() else () ) diff --git a/tests/models/transfo_xl/test_tokenization_transfo_xl.py b/tests/models/transfo_xl/test_tokenization_transfo_xl.py index 3f7065c51b4739..15b712ff3784e3 100644 --- a/tests/models/transfo_xl/test_tokenization_transfo_xl.py +++ b/tests/models/transfo_xl/test_tokenization_transfo_xl.py @@ -23,7 +23,6 @@ class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = TransfoXLTokenizer test_rust_tokenizer = False test_seq2seq = False diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index 228b0dd175f86a..1f108176959b70 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -546,7 +546,6 @@ def _load_datasamples(self, num_samples): return [x["array"] for x in speech_samples] def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test") return ds[:num_samples] diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py index f3982a79df79a9..f2ab1cb1bde2e5 100644 --- a/tests/models/upernet/test_modeling_upernet.py +++ b/tests/models/upernet/test_modeling_upernet.py @@ -19,6 +19,7 @@ import unittest from huggingface_hub import 
hf_hub_download + from transformers import ConvNextConfig, UperNetConfig from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available diff --git a/tests/models/videomae/test_image_processing_videomae.py b/tests/models/videomae/test_image_processing_videomae.py index 53676328450bc5..70980c195cff33 100644 --- a/tests/models/videomae/test_image_processing_videomae.py +++ b/tests/models/videomae/test_image_processing_videomae.py @@ -81,7 +81,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class VideoMAEImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = VideoMAEImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index bc665410b6c6b9..63e6f19ffca297 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -20,8 +20,8 @@ import unittest import numpy as np - from huggingface_hub import hf_hub_download + from transformers import VideoMAEConfig from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_vision, slow, torch_device diff --git a/tests/models/vilt/test_image_processing_vilt.py b/tests/models/vilt/test_image_processing_vilt.py index a89fd9b854d1af..f33492d1020578 100644 --- a/tests/models/vilt/test_image_processing_vilt.py +++ b/tests/models/vilt/test_image_processing_vilt.py @@ -117,7 +117,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class ViltImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ViltImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py index aaaf62c5a0b464..c4f84012c73a7b 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py @@ -70,7 +70,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -100,7 +100,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -126,7 +126,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model} @@ -162,7 +162,7 @@ def check_encoder_decoder_model_output_attentions( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = 
decoder_input_ids[:, :-1] @@ -236,7 +236,6 @@ def check_encoder_decoder_model_generate(self, pixel_values, config, decoder_con self.assertEqual(generated_sequences.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): - pt_model.to(torch_device) pt_model.eval() @@ -278,7 +277,6 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 1e-5) def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): - encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = VisionEncoderDecoderModel(encoder_decoder_config) @@ -290,7 +288,6 @@ def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict) def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict): - encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = VisionEncoderDecoderModel(encoder_decoder_config) @@ -330,7 +327,6 @@ def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): @is_pt_flax_cross_test def test_pt_flax_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() config = config_inputs_dict.pop("config") decoder_config = config_inputs_dict.pop("decoder_config") @@ -442,7 +438,6 @@ def get_from_encoderdecoder_pretrained_model(self): ) def _check_configuration_tie(self, model): - module = model.module.bind(model.params) assert id(module.decoder.config) == id(model.config.decoder) @@ -465,7 +460,6 @@ def prepare_img(): class FlaxViT2GPT2ModelIntegrationTest(unittest.TestCase): @slow def test_inference_coco_en(self): - loc = "ydshieh/vit-gpt2-coco-en" feature_extractor = ViTFeatureExtractor.from_pretrained(loc) @@ -501,7 +495,6 @@ def test_inference_coco_en(self): self.assertLessEqual(max_diff, 1e-4) def generate_step(pixel_values): - outputs = model.generate(pixel_values, max_length=16, num_beams=4) output_ids = outputs.sequences preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) diff --git a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py index ada036e2aa543e..498f22b17e4f85 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py @@ -84,7 +84,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -114,7 +114,7 @@ def check_encoder_decoder_model( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -158,7 +158,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, 
"return_dict": return_dict} @@ -185,7 +185,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -223,7 +223,7 @@ def check_encoder_decoder_model_labels( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -253,7 +253,7 @@ def check_encoder_decoder_model_output_attentions( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -403,7 +403,6 @@ def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=1e-5, nam ) def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): - pt_inputs_dict = {} for name, key in tf_inputs_dict.items(): if type(key) == bool: @@ -423,7 +422,6 @@ def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): return pt_inputs_dict def check_pt_tf_models(self, tf_model, pt_model, tf_inputs_dict): - pt_inputs_dict = self.prepare_pt_inputs_from_tf_inputs(tf_inputs_dict) # send pytorch inputs to the correct device @@ -463,7 +461,6 @@ def check_pt_tf_equivalence(self, tf_model, pt_model, tf_inputs_dict): self.check_pt_tf_models(tf_model, pt_model, tf_inputs_dict) def check_pt_to_tf_equivalence(self, config, decoder_config, tf_inputs_dict): - encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) # Output all for aggressive testing encoder_decoder_config.output_hidden_states = True @@ -479,7 +476,6 @@ def check_pt_to_tf_equivalence(self, config, decoder_config, tf_inputs_dict): self.check_pt_tf_equivalence(tf_model, pt_model, tf_inputs_dict) def check_tf_to_pt_equivalence(self, config, decoder_config, tf_inputs_dict): - encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) # Output all for aggressive testing encoder_decoder_config.output_hidden_states = True @@ -534,7 +530,6 @@ def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): @is_pt_tf_cross_test def test_pt_tf_model_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() labels = config_inputs_dict.pop("decoder_token_labels") @@ -839,7 +834,6 @@ def test_encoder_decoder_from_pretrained(self): decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids with tempfile.TemporaryDirectory() as tmp_dirname: - # Since most of HF's models don't have pretrained cross-attention layers, they are randomly # initialized even if we create models using `from_pretrained` method. # For the tests, the decoder need to be a model with pretrained cross-attention layers. 
@@ -895,7 +889,6 @@ def test_encoder_decoder_from_pretrained(self): class TFViT2GPT2ModelIntegrationTest(unittest.TestCase): @slow def test_inference_coco_en(self): - loc = "ydshieh/vit-gpt2-coco-en" feature_extractor = ViTFeatureExtractor.from_pretrained(loc) diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index 6228cb51fd5ac4..52b23195b1db06 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -135,7 +135,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_attention_mask, return_dict, pixel_values=None, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -226,7 +226,7 @@ def check_encoder_decoder_model_output_attentions( decoder_attention_mask, labels=None, pixel_values=None, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -402,7 +402,7 @@ def check_encoder_decoder_model_output_attentions( decoder_attention_mask, labels=None, pixel_values=None, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -590,7 +590,7 @@ def check_encoder_decoder_model_output_attentions( decoder_attention_mask, labels=None, pixel_values=None, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -747,7 +747,6 @@ def test_inference_printed(self): class ViT2GPT2ModelIntegrationTest(unittest.TestCase): @slow def test_inference_coco_en(self): - loc = "ydshieh/vit-gpt2-coco-en" feature_extractor = ViTFeatureExtractor.from_pretrained(loc) @@ -787,7 +786,6 @@ def test_inference_coco_en(self): self.assertLessEqual(max_diff, 1e-4) def generate_step(pixel_values): - outputs = model.generate( pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True, output_scores=True ) diff --git a/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py index cb476c128aa685..1cfa04e67f3994 100644 --- a/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py +++ b/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py @@ -100,7 +100,6 @@ def check_model_from_pretrained_configs( def check_vision_text_dual_encoder_from_pretrained( self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs ): - vision_model, text_model = self.get_vision_text_model(vision_config, text_config) kwargs = {"vision_model": vision_model, "text_model": text_model} model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(**kwargs) @@ -157,7 +156,6 @@ def check_vision_text_output_attention( ) def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): - pt_model.to(torch_device) pt_model.eval() @@ -199,7 +197,6 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 4e-2) def check_equivalence_pt_to_flax(self, vision_config, 
text_config, inputs_dict): - config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config) pt_model = VisionTextDualEncoderModel(config) @@ -211,7 +208,6 @@ def check_equivalence_pt_to_flax(self, vision_config, text_config, inputs_dict): self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict) def check_equivalence_flax_to_pt(self, vision_config, text_config, inputs_dict): - config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config) pt_model = VisionTextDualEncoderModel(config) @@ -239,7 +235,6 @@ def test_vision_text_output_attention(self): @is_pt_flax_cross_test def test_pt_flax_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() vision_config = config_inputs_dict.pop("vision_config") text_config = config_inputs_dict.pop("text_config") @@ -311,7 +306,6 @@ def prepare_config_and_inputs(self): "vision_config": vision_config, "pixel_values": pixel_values, "attention_mask": attention_mask, - "text_config": text_config, "input_ids": input_ids, "token_type_ids": token_type_ids, } @@ -362,7 +356,6 @@ def prepare_config_and_inputs(self): "vision_config": vision_config, "pixel_values": pixel_values, "attention_mask": attention_mask, - "text_config": text_config, "input_ids": input_ids, "token_type_ids": token_type_ids, } diff --git a/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py index 18182047d66475..c3c2321c24ee71 100644 --- a/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py +++ b/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py @@ -108,7 +108,6 @@ def check_vision_text_dual_encoder_model( def check_vision_text_dual_encoder_from_pretrained( self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs ): - vision_model, text_model = self.get_vision_text_model(vision_config, text_config) kwargs = {"vision_model": vision_model, "text_model": text_model} model = VisionTextDualEncoderModel.from_vision_text_pretrained(**kwargs) @@ -175,7 +174,6 @@ def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).") def check_pt_flax_equivalence(self, pt_model, fx_model, input_ids, attention_mask, pixel_values, **kwargs): - pt_model.to(torch_device) pt_model.eval() @@ -218,7 +216,6 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, input_ids, attention_mas self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 4e-2) def check_equivalence_pt_to_flax(self, vision_config, text_config, inputs_dict): - config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config) pt_model = VisionTextDualEncoderModel(config) @@ -230,7 +227,6 @@ def check_equivalence_pt_to_flax(self, vision_config, text_config, inputs_dict): self.check_pt_flax_equivalence(pt_model, fx_model, **inputs_dict) def check_equivalence_flax_to_pt(self, vision_config, text_config, inputs_dict): - config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config) pt_model = VisionTextDualEncoderModel(config) @@ -262,7 +258,6 @@ def test_vision_text_output_attention(self): @is_pt_flax_cross_test def test_pt_flax_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() vision_config = config_inputs_dict.pop("vision_config") text_config = config_inputs_dict.pop("text_config") @@ -341,7 
+336,6 @@ def prepare_config_and_inputs(self): "vision_config": vision_config, "pixel_values": pixel_values, "attention_mask": input_mask, - "text_config": text_config, "input_ids": input_ids, "text_token_type_ids": token_type_ids, "text_sequence_labels": sequence_labels, @@ -429,7 +423,6 @@ def prepare_config_and_inputs(self): "vision_config": vision_config, "pixel_values": pixel_values, "attention_mask": input_mask, - "text_config": text_config, "input_ids": input_ids, "text_token_type_ids": token_type_ids, "text_sequence_labels": sequence_labels, @@ -491,7 +484,6 @@ def prepare_config_and_inputs(self): "vision_config": vision_config, "pixel_values": pixel_values, "attention_mask": input_mask, - "text_config": text_config, "input_ids": input_ids, "text_token_type_ids": token_type_ids, "text_sequence_labels": sequence_labels, diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py index 92ed812fe47d1e..00b0eb3635a54b 100644 --- a/tests/models/visual_bert/test_modeling_visual_bert.py +++ b/tests/models/visual_bert/test_modeling_visual_bert.py @@ -300,7 +300,6 @@ def create_and_check_for_flickr(self, config, input_dict): @require_torch class VisualBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( VisualBertModel, diff --git a/tests/models/vit/test_image_processing_vit.py b/tests/models/vit/test_image_processing_vit.py index ce0cc5610a8368..171ce65e74f17d 100644 --- a/tests/models/vit/test_image_processing_vit.py +++ b/tests/models/vit/test_image_processing_vit.py @@ -74,7 +74,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class ViTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ViTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/vit/test_modeling_flax_vit.py b/tests/models/vit/test_modeling_flax_vit.py index 611f9364885450..ca3130493eda1d 100644 --- a/tests/models/vit/test_modeling_flax_vit.py +++ b/tests/models/vit/test_modeling_flax_vit.py @@ -25,8 +25,8 @@ if is_flax_available(): - import jax + from transformers.models.vit.modeling_flax_vit import FlaxViTForImageClassification, FlaxViTModel @@ -125,7 +125,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxViTModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = (FlaxViTModel, FlaxViTForImageClassification) if is_flax_available() else () def setUp(self) -> None: diff --git a/tests/models/vit/test_modeling_tf_vit.py b/tests/models/vit/test_modeling_tf_vit.py index 7f452886f150a3..95ed814eba655e 100644 --- a/tests/models/vit/test_modeling_tf_vit.py +++ b/tests/models/vit/test_modeling_tf_vit.py @@ -206,7 +206,6 @@ def test_for_image_classification(self): @slow def test_model_from_pretrained(self): - model = TFViTModel.from_pretrained("google/vit-base-patch16-224") self.assertIsNotNone(model) diff --git a/tests/models/vit_mae/test_modeling_tf_vit_mae.py b/tests/models/vit_mae/test_modeling_tf_vit_mae.py index 3bc582cb1fcd07..8c19c0149112b5 100644 --- a/tests/models/vit_mae/test_modeling_tf_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_tf_vit_mae.py @@ -266,7 +266,6 @@ def prepare_numpy_arrays(inputs_dict): # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def check_pt_tf_models(self, tf_model, pt_model, tf_inputs_dict): - # make masks reproducible np.random.seed(2) @@ -453,7 +452,6 @@ def 
test_model_outputs_equivalence(self): @slow def test_model_from_pretrained(self): - model = TFViTMAEModel.from_pretrained("google/vit-base-patch16-224") self.assertIsNotNone(model) diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index 5a48d253a385ee..35693da4cf60b1 100644 --- a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -209,7 +209,6 @@ def test_for_pretraining(self): # overwrite from common since ViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): - # make masks reproducible np.random.seed(2) @@ -224,7 +223,6 @@ def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): super().check_pt_tf_models(tf_model, pt_model, pt_inputs_dict) def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: diff --git a/tests/models/wav2vec2/test_feature_extraction_wav2vec2.py b/tests/models/wav2vec2/test_feature_extraction_wav2vec2.py index 98cf2f1c495bc6..44f2ed5b87362d 100644 --- a/tests/models/wav2vec2/test_feature_extraction_wav2vec2.py +++ b/tests/models/wav2vec2/test_feature_extraction_wav2vec2.py @@ -96,7 +96,6 @@ def _flatten(list_of_lists): class Wav2Vec2FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = Wav2Vec2FeatureExtractor def setUp(self): diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py index ac1dd3bcb44ae5..33388eb6d34f97 100644 --- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py @@ -45,6 +45,7 @@ import jax.numpy as jnp import optax from flax.traverse_util import flatten_dict + from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor from transformers.models.wav2vec2.modeling_flax_wav2vec2 import ( FlaxWav2Vec2ForCTC, @@ -58,6 +59,7 @@ if is_pyctcdecode_available(): import pyctcdecode.decoder + from transformers import Wav2Vec2ProcessorWithLM from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm @@ -67,7 +69,6 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) @@ -276,7 +277,6 @@ def model_jitted(input_values, attention_mask=None, **kwargs): self.assertEqual(len(outputs), len(jitted_outputs)) for jitted_output, output in zip(jitted_outputs, outputs): - self.assertEqual(jitted_output.shape, output.shape) def test_freeze_feature_encoder(self): diff --git a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py index 8f9a8f0bd73bd7..38e83bcdf9e594 100644 --- a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py @@ -26,8 +26,8 @@ import numpy as np import pytest from datasets import load_dataset - from huggingface_hub import snapshot_download + from transformers import Wav2Vec2Config, is_tf_available from transformers.testing_utils import ( CaptureLogger, @@ -53,6 +53,7 @@ if is_pyctcdecode_available(): import pyctcdecode.decoder + from transformers import Wav2Vec2ProcessorWithLM from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm @@ -62,7 +63,6 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): - error = None try: _ = 
in_queue.get(timeout=timeout) @@ -283,7 +283,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFWav2Vec2ModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFWav2Vec2Model, TFWav2Vec2ForCTC) if is_tf_available() else () test_resize_embeddings = False test_head_masking = False diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index 9fe18fdf57c8ae..35df9fc223b29b 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -81,6 +81,7 @@ if is_pyctcdecode_available(): import pyctcdecode.decoder + from transformers import Wav2Vec2ProcessorWithLM from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm @@ -90,7 +91,6 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py index 829c6096814455..df5db0a3e298a9 100644 --- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py +++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py @@ -24,8 +24,8 @@ import numpy as np from datasets import load_dataset from packaging import version - from parameterized import parameterized + from transformers import AutoProcessor from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES @@ -38,6 +38,7 @@ if is_pyctcdecode_available(): from huggingface_hub import snapshot_download from pyctcdecode import BeamSearchDecoderCTC + from transformers.models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM from transformers.models.wav2vec2_with_lm.processing_wav2vec2_with_lm import Wav2Vec2DecoderWithLMOutput diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py index c03763cdf63f77..b490556c5fe6b3 100644 --- a/tests/models/whisper/test_feature_extraction_whisper.py +++ b/tests/models/whisper/test_feature_extraction_whisper.py @@ -113,7 +113,6 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = WhisperFeatureExtractor if is_speech_available() else None def setUp(self): diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py index 1fd89f2525b893..3bc04e56a6d149 100644 --- a/tests/models/whisper/test_modeling_tf_whisper.py +++ b/tests/models/whisper/test_modeling_tf_whisper.py @@ -629,7 +629,6 @@ def test_lm_head_model_random_beam_search_generate(self): def _load_datasamples(num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -638,7 +637,6 @@ def _load_datasamples(num_samples): def _test_large_logits_librispeech(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) @@ -687,7 +685,6 @@ def _test_large_logits_librispeech(in_queue, out_queue, timeout): def _test_large_generation(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) @@ -715,7 +712,6 @@ def _test_large_generation(in_queue, out_queue, timeout): 
def _test_large_generation_multilingual(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) @@ -761,7 +757,6 @@ def _test_large_generation_multilingual(in_queue, out_queue, timeout): def _test_large_batched_generation(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 0944a3a64634dd..54382c2884e633 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -756,7 +756,6 @@ def default_processor(self): return WhisperProcessor.from_pretrained("openai/whisper-base") def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -886,7 +885,6 @@ def test_large_logits_librispeech(self): @slow def test_tiny_en_generation(self): - torch_device = "cpu" set_seed(0) processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") @@ -910,7 +908,6 @@ def test_tiny_en_generation(self): @slow def test_tiny_generation(self): - torch_device = "cpu" set_seed(0) processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index b4f3252e2fa46d..73550c061825bd 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -21,8 +21,8 @@ import unittest import numpy as np - from huggingface_hub import hf_hub_download + from transformers import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -393,7 +393,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class XCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (XCLIPTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -445,7 +444,6 @@ def __init__( mit_hidden_size=64, is_training=True, ): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: diff --git a/tests/models/xglm/test_modeling_flax_xglm.py b/tests/models/xglm/test_modeling_flax_xglm.py index 924c73321e9036..60436cb1f9a355 100644 --- a/tests/models/xglm/test_modeling_flax_xglm.py +++ b/tests/models/xglm/test_modeling_flax_xglm.py @@ -26,10 +26,10 @@ if is_flax_available(): - import numpy as np - import jax import jax.numpy as jnp + import numpy as np + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, @@ -196,7 +196,6 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_sentencepiece @require_flax class FlaxXGLMModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): - all_model_classes = (FlaxXGLMModel, FlaxXGLMForCausalLM) if is_flax_available() else () all_generative_model_classes = (FlaxXGLMForCausalLM,) if is_flax_available() else () diff --git a/tests/models/xglm/test_modeling_tf_xglm.py b/tests/models/xglm/test_modeling_tf_xglm.py index b6387901dc955b..6f4069a40c712d 100644 --- a/tests/models/xglm/test_modeling_tf_xglm.py +++ b/tests/models/xglm/test_modeling_tf_xglm.py @@ -140,7 +140,6 @@ def 
prepare_config_and_inputs_for_common(self): @require_tf class TFXGLMModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFXGLMModel, TFXGLMForCausalLM) if is_tf_available() else () all_generative_model_classes = (TFXGLMForCausalLM,) if is_tf_available() else () test_onnx = False diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py index 662299fb7eb1d9..f906ebf899b72e 100644 --- a/tests/models/xglm/test_modeling_xglm.py +++ b/tests/models/xglm/test_modeling_xglm.py @@ -295,7 +295,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (XGLMModel, XGLMForCausalLM) if is_torch_available() else () all_generative_model_classes = (XGLMForCausalLM,) if is_torch_available() else () fx_compatible = True diff --git a/tests/models/xglm/test_tokenization_xglm.py b/tests/models/xglm/test_tokenization_xglm.py index 05259ffaf9a335..74dd4dab5e3e75 100644 --- a/tests/models/xglm/test_tokenization_xglm.py +++ b/tests/models/xglm/test_tokenization_xglm.py @@ -31,7 +31,6 @@ @require_sentencepiece @require_tokenizers class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XGLMTokenizer rust_tokenizer_class = XGLMTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/xlm/test_modeling_tf_xlm.py b/tests/models/xlm/test_modeling_tf_xlm.py index 00e77cee64ba89..d443b7f843b25a 100644 --- a/tests/models/xlm/test_modeling_tf_xlm.py +++ b/tests/models/xlm/test_modeling_tf_xlm.py @@ -277,7 +277,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFXLMModel, diff --git a/tests/models/xlm/test_modeling_xlm.py b/tests/models/xlm/test_modeling_xlm.py index 190e1e958377cd..3e8a5efc894800 100644 --- a/tests/models/xlm/test_modeling_xlm.py +++ b/tests/models/xlm/test_modeling_xlm.py @@ -360,7 +360,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class XLMModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( XLMModel, diff --git a/tests/models/xlm/test_tokenization_xlm.py b/tests/models/xlm/test_tokenization_xlm.py index adb4835eda4070..6e3103521585c8 100644 --- a/tests/models/xlm/test_tokenization_xlm.py +++ b/tests/models/xlm/test_tokenization_xlm.py @@ -25,7 +25,6 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XLMTokenizer test_rust_tokenizer = False diff --git a/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py b/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py index 5dec186bc7b9ce..4cd6eb9b5f9d5a 100644 --- a/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py +++ b/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py @@ -142,9 +142,8 @@ def test_xprophetnet_ntg_inference(self): tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True) for g in summary_ids_beam1 ] EXPECTED_TITLE_EN_BEAM1_TOK = "▁Microsoft ▁to ▁end ▁free ▁support ▁for ▁Windows ▁7".split(" ") - EXPECTED_TITLE_RU_BEAM1_TOK = ( - "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года" - .split(" ") + EXPECTED_TITLE_RU_BEAM1_TOK = "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года".split( + " " ) EXPECTED_TITLE_ZH_BEAM1_TOK = "微软 公司 打算 终止 对 Windows ▁7 操作 系统的 免费 支持".split(" ") self.assertListEqual( 
diff --git a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py b/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py index d560007fe3163f..13c02b5415f8fb 100644 --- a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py +++ b/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py @@ -27,7 +27,6 @@ @require_sentencepiece class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XLMProphetNetTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py b/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py index c821cda6f3ce0c..0ceaa739f3fa86 100644 --- a/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py +++ b/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py @@ -22,6 +22,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers import FlaxXLMRobertaModel diff --git a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py index c8f934b258b93c..0dde56481cc7a5 100644 --- a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py +++ b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py @@ -31,7 +31,6 @@ @require_sentencepiece @require_tokenizers class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XLMRobertaTokenizer rust_tokenizer_class = XLMRobertaTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py index 6c9577be777fd4..4efb48cdf0b7e0 100644 --- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py +++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py @@ -358,7 +358,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class XLMRobertaXLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( XLMRobertaXLForCausalLM, diff --git a/tests/models/xlnet/test_modeling_tf_xlnet.py b/tests/models/xlnet/test_modeling_tf_xlnet.py index bc8f31006bd4ba..a8686d4a2b1f78 100644 --- a/tests/models/xlnet/test_modeling_tf_xlnet.py +++ b/tests/models/xlnet/test_modeling_tf_xlnet.py @@ -332,7 +332,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFXLNetModel, diff --git a/tests/models/xlnet/test_tokenization_xlnet.py b/tests/models/xlnet/test_tokenization_xlnet.py index 6125a1dffd7791..a9f39202f4a175 100644 --- a/tests/models/xlnet/test_tokenization_xlnet.py +++ b/tests/models/xlnet/test_tokenization_xlnet.py @@ -27,7 +27,6 @@ @require_sentencepiece @require_tokenizers class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XLNetTokenizer rust_tokenizer_class = XLNetTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py index b262a654a5a4a6..937cb6fac6e131 100644 --- a/tests/models/yolos/test_image_processing_yolos.py +++ b/tests/models/yolos/test_image_processing_yolos.py @@ -115,7 +115,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class YolosImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = YolosImageProcessor if is_vision_available() else None def setUp(self): diff --git 
a/tests/models/yoso/test_modeling_yoso.py b/tests/models/yoso/test_modeling_yoso.py index 0a0749dd7d9bcd..5f72f992eea541 100644 --- a/tests/models/yoso/test_modeling_yoso.py +++ b/tests/models/yoso/test_modeling_yoso.py @@ -281,7 +281,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class YosoModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( YosoModel, diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index e7a0e15d243d8d..792c2a65cb0c44 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -5,8 +5,8 @@ from unittest.mock import patch import pytest - from parameterized import parameterized + from transformers import AutoConfig, PreTrainedTokenizerBase, is_tf_available, is_torch_available from transformers.onnx import ( EXTERNAL_DATA_FORMAT_SIZE_LIMIT, diff --git a/tests/optimization/test_optimization.py b/tests/optimization/test_optimization.py index c0c5a31a3a49de..551b24a48ec59a 100644 --- a/tests/optimization/test_optimization.py +++ b/tests/optimization/test_optimization.py @@ -120,7 +120,6 @@ def assertListAlmostEqual(self, list1, list2, tol, msg=None): self.assertAlmostEqual(a, b, delta=tol, msg=msg) def test_schedulers(self): - common_kwargs = {"num_warmup_steps": 2, "num_training_steps": 10} # schedulers doct format # function: (sched_args_dict, expected_learning_rates) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 413fc7657f6cfc..2bda09fe00c7fe 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -17,8 +17,8 @@ import numpy as np import pytest from datasets import load_dataset - from huggingface_hub import snapshot_download + from transformers import ( MODEL_FOR_CTC_MAPPING, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, @@ -277,7 +277,6 @@ def test_torch_small_no_tokenizer_files(self): @require_torch @slow def test_torch_large(self): - speech_recognizer = pipeline( task="automatic-speech-recognition", model="facebook/wav2vec2-base-960h", @@ -645,7 +644,6 @@ def test_simple_wav2vec2(self): @require_torch @require_torchaudio def test_simple_s2t(self): - model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-it-st") tokenizer = AutoTokenizer.from_pretrained("facebook/s2t-small-mustc-en-it-st") feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-mustc-en-it-st") diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 2213ec6ca89a71..8e38c6dcd6186b 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -26,10 +26,10 @@ import datasets import numpy as np - import requests from huggingface_hub import HfFolder, Repository, create_repo, delete_repo, set_access_token from requests.exceptions import HTTPError + from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, @@ -119,7 +119,6 @@ def is_test_to_skip(test_casse_name, config_class, model_architecture, tokenizer # TODO: check and fix if possible if not to_skip and tokenizer_name is not None: - if ( test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast") @@ -196,7 +195,6 @@ def is_test_to_skip(test_casse_name, config_class, model_architecture, tokenizer def validate_test_components(test_case, model, tokenizer, processor): - # TODO: Move this to tiny model creation script # 
head-specific (within a model type) necessary changes to the config # 1. for `BlenderbotForCausalLM` @@ -296,7 +294,6 @@ def data(n): mapping = dct.get(key, {}) if mapping: for config_class, model_architectures in mapping.items(): - if not isinstance(model_architectures, tuple): model_architectures = (model_architectures,) diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py index d79f2f2abafc40..8e13407246558c 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -44,7 +44,6 @@ def hashimage(image: Image) -> str: @require_timm @require_torch class DepthEstimationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): - model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING def get_test_pipeline(self, model, tokenizer, processor): diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index 3ac4b70737ca0b..a1a0c7b3714504 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ b/tests/pipelines/test_pipelines_image_segmentation.py @@ -18,9 +18,9 @@ import datasets import numpy as np +import requests from datasets import load_dataset -import requests from transformers import ( MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py index 601606a4cd6cb0..907419618339a5 100644 --- a/tests/pipelines/test_pipelines_video_classification.py +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -15,6 +15,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEFeatureExtractor from transformers.pipelines import VideoClassificationPipeline, pipeline from transformers.testing_utils import ( @@ -47,7 +48,6 @@ def get_test_pipeline(self, model, tokenizer, processor): return video_classifier, examples def run_pipeline_test(self, video_classifier, examples): - for example in examples: outputs = video_classifier(example) diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.py b/tests/pipelines/test_pipelines_zero_shot_object_detection.py index fea8bf1c48e995..b84ba20a4c9c45 100644 --- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py @@ -33,7 +33,6 @@ def open(*args, **kwargs): @require_vision @require_torch class ZeroShotObjectDetectionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): - model_mapping = MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING def get_test_pipeline(self, model, tokenizer, processor): diff --git a/tests/repo_utils/test_check_dummies.py b/tests/repo_utils/test_check_dummies.py index 8dde0f49443b9c..2411990a16d847 100644 --- a/tests/repo_utils/test_check_dummies.py +++ b/tests/repo_utils/test_check_dummies.py @@ -20,7 +20,7 @@ git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) sys.path.append(os.path.join(git_repo_path, "utils")) -import check_dummies +import check_dummies # noqa: E402 from check_dummies import create_dummy_files, create_dummy_object, find_backend, read_init # noqa: E402 diff --git a/tests/sagemaker/conftest.py b/tests/sagemaker/conftest.py index 8e7c0bbf1d0cda..0ed261864bcd27 100644 --- a/tests/sagemaker/conftest.py +++ b/tests/sagemaker/conftest.py @@ -4,7 +4,6 @@ 
import os import pytest - from attr import dataclass diff --git a/tests/sagemaker/scripts/tensorflow/run_tf.py b/tests/sagemaker/scripts/tensorflow/run_tf.py index a47e76c09d6125..03f631d2667995 100644 --- a/tests/sagemaker/scripts/tensorflow/run_tf.py +++ b/tests/sagemaker/scripts/tensorflow/run_tf.py @@ -10,7 +10,6 @@ if __name__ == "__main__": - parser = argparse.ArgumentParser() # Hyperparameters sent by the client are passed as command-line arguments to the script. diff --git a/tests/sagemaker/scripts/tensorflow/run_tf_dist.py b/tests/sagemaker/scripts/tensorflow/run_tf_dist.py index 84f4275aafce3a..f8f2e4bcf29d49 100644 --- a/tests/sagemaker/scripts/tensorflow/run_tf_dist.py +++ b/tests/sagemaker/scripts/tensorflow/run_tf_dist.py @@ -85,7 +85,6 @@ def get_datasets(tokenizer, train_batch_size, eval_batch_size): if __name__ == "__main__": - parser = argparse.ArgumentParser() # Hyperparameters sent by the client are passed as command-line arguments to the script. @@ -143,7 +142,6 @@ def get_datasets(tokenizer, train_batch_size, eval_batch_size): # Training if args.do_train: - # train_results = model.fit(tf_train_dataset, epochs=args.epochs, batch_size=args.train_batch_size) start_train_time = time.time() train_results = fit( @@ -171,7 +169,6 @@ def get_datasets(tokenizer, train_batch_size, eval_batch_size): # Evaluation if args.do_eval and (not SDP_ENABLED or sdp.rank() == 0): - result = model.evaluate(tf_test_dataset, batch_size=args.per_device_eval_batch_size, return_dict=True) logger.info("*** Evaluate ***") diff --git a/tests/sagemaker/test_multi_node_data_parallel.py b/tests/sagemaker/test_multi_node_data_parallel.py index 8fb60d64a61f8c..cc7f9e5e84f8bf 100644 --- a/tests/sagemaker/test_multi_node_data_parallel.py +++ b/tests/sagemaker/test_multi_node_data_parallel.py @@ -5,7 +5,6 @@ from ast import literal_eval import pytest - from parameterized import parameterized, parameterized_class from . import is_sagemaker_available diff --git a/tests/sagemaker/test_multi_node_model_parallel.py b/tests/sagemaker/test_multi_node_model_parallel.py index 38a1c9a6b3b7bd..95d5b9fa855904 100644 --- a/tests/sagemaker/test_multi_node_model_parallel.py +++ b/tests/sagemaker/test_multi_node_model_parallel.py @@ -5,7 +5,6 @@ from ast import literal_eval import pytest - from parameterized import parameterized, parameterized_class from . import is_sagemaker_available @@ -50,7 +49,6 @@ def setUp(self): assert hasattr(self, "env") def create_estimator(self, instance_count): - # configuration for running training on smdistributed Model Parallel mpi_options = { "enabled": True, diff --git a/tests/sagemaker/test_single_node_gpu.py b/tests/sagemaker/test_single_node_gpu.py index e71f82d31634e0..f2a62547e787c6 100644 --- a/tests/sagemaker/test_single_node_gpu.py +++ b/tests/sagemaker/test_single_node_gpu.py @@ -5,7 +5,6 @@ from ast import literal_eval import pytest - from parameterized import parameterized_class from . 
import is_sagemaker_available diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index df05d2a4ac84fd..dc9927f1938527 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -25,6 +25,7 @@ from huggingface_hub import HfFolder, delete_repo, set_access_token from requests.exceptions import HTTPError + from transformers import AutoConfig, BertConfig, GPT2Config, is_torch_available from transformers.configuration_utils import PretrainedConfig from transformers.testing_utils import TOKEN, USER, is_staging_test diff --git a/tests/test_feature_extraction_common.py b/tests/test_feature_extraction_common.py index 5c60cf58ac2533..ee8dfefb8406a0 100644 --- a/tests/test_feature_extraction_common.py +++ b/tests/test_feature_extraction_common.py @@ -24,6 +24,7 @@ from huggingface_hub import HfFolder, delete_repo, set_access_token from requests.exceptions import HTTPError + from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor from transformers.testing_utils import TOKEN, USER, check_json_file_has_correct_format, get_tests_dir, is_staging_test diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index d8485e3853d840..e18f8bf60f6fc1 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -24,6 +24,7 @@ from huggingface_hub import HfFolder, delete_repo, set_access_token from requests.exceptions import HTTPError + from transformers import AutoImageProcessor, ViTImageProcessor from transformers.testing_utils import ( TOKEN, diff --git a/tests/test_image_transforms.py b/tests/test_image_transforms.py index 206c8dc5b8fbf1..2287fdbf2ce99d 100644 --- a/tests/test_image_transforms.py +++ b/tests/test_image_transforms.py @@ -16,8 +16,8 @@ import unittest import numpy as np - from parameterized import parameterized + from transformers.testing_utils import require_flax, require_tf, require_torch, require_vision from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available, is_vision_available diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 4b3051a13fd4f3..b78e7775515e9f 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -30,11 +30,11 @@ from typing import Dict, List, Tuple import numpy as np - -import transformers from huggingface_hub import HfFolder, delete_repo, set_access_token from huggingface_hub.file_download import http_get from requests.exceptions import HTTPError + +import transformers from transformers import ( AutoConfig, AutoModel, @@ -108,9 +108,9 @@ if is_torch_available(): import torch + from test_module.custom_modeling import CustomModel, NoSuperInitModel from torch import nn - from test_module.custom_modeling import CustomModel, NoSuperInitModel from transformers import ( BERT_PRETRAINED_MODEL_ARCHIVE_LIST, MODEL_MAPPING, @@ -160,6 +160,7 @@ def forward(self, x): if is_flax_available(): import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, @@ -183,7 +184,6 @@ def _config_zero_init(config): @require_torch class ModelTesterMixin: - model_tester = None all_model_classes = () all_generative_model_classes = () @@ -417,7 +417,6 @@ def test_save_load_fast_init_to_base(self): base_class = base_class[0] for model_class in self.all_model_classes: - if model_class == base_class: continue @@ -706,7 +705,6 @@ def 
test_torchscript_output_hidden_state(self): # This is copied from `torch/testing/_internal/jit_utils.py::clear_class_registry` def clear_torch_jit_class_registry(self): - torch._C._jit_clear_class_registry() torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() # torch 1.8 has no `_clear_class_state` in `torch.jit._state` @@ -1512,7 +1510,6 @@ def test_correct_missing_keys(self): base_model_prefix = model.base_model_prefix if hasattr(model, base_model_prefix): - extra_params = {k: v for k, v in model.named_parameters() if not k.startswith(base_model_prefix)} extra_params.update({k: v for k, v in model.named_buffers() if not k.startswith(base_model_prefix)}) # Some models define this as None @@ -1854,7 +1851,6 @@ def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=1e-5, nam ) def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict): - tf_inputs_dict = {} for key, tensor in pt_inputs_dict.items(): # skip key that does not exist in tf @@ -1875,7 +1871,6 @@ def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict): return tf_inputs_dict def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): - tf_inputs_dict = self.prepare_tf_inputs_from_pt_inputs(pt_inputs_dict) # send pytorch inputs to the correct device @@ -1907,7 +1902,6 @@ def test_pt_tf_model_equivalence(self): import transformers for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning @@ -2544,7 +2538,6 @@ def test_problem_types(self): for problem_type in problem_types: with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] config.num_labels = problem_type["num_labels"] diff --git a/tests/test_modeling_flax_common.py b/tests/test_modeling_flax_common.py index fe1cb694350daf..c9fe7cc4719b6c 100644 --- a/tests/test_modeling_flax_common.py +++ b/tests/test_modeling_flax_common.py @@ -21,10 +21,10 @@ from typing import List, Tuple import numpy as np - -import transformers from huggingface_hub import HfFolder, delete_repo, set_access_token from requests.exceptions import HTTPError + +import transformers from transformers import BertConfig, is_flax_available, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import ( @@ -48,6 +48,7 @@ from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.serialization import from_bytes from flax.traverse_util import flatten_dict, unflatten_dict + from transformers import ( FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING, FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, @@ -275,7 +276,6 @@ def test_equivalence_pt_to_flax(self): for model_class in self.all_model_classes: with self.subTest(model_class.__name__): - # Output all for aggressive testing config.output_hidden_states = True config.output_attentions = self.has_attentions @@ -328,7 +328,6 @@ def test_equivalence_flax_to_pt(self): for model_class in self.all_model_classes: with self.subTest(model_class.__name__): - # Output all for aggressive testing config.output_hidden_states = True config.output_attentions = self.has_attentions @@ -569,7 +568,6 @@ def model_jitted(input_ids, attention_mask=None, **kwargs): self.assertEqual(len(outputs), len(jitted_outputs)) for jitted_output, output in zip(jitted_outputs, outputs): - self.assertEqual(jitted_output.shape, output.shape) def test_forward_signature(self): diff --git 
a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index eaf8f82e78e88a..ced3c0f86afe6d 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -28,10 +28,10 @@ from typing import List, Tuple, get_type_hints from datasets import Dataset - from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token from huggingface_hub.file_download import http_get from requests.exceptions import HTTPError + from transformers import is_tf_available, is_torch_available from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import get_values @@ -157,7 +157,6 @@ def _return_type_has_loss(model): @require_tf class TFModelTesterMixin: - model_tester = None all_model_classes = () all_generative_model_classes = () @@ -618,7 +617,6 @@ def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=1e-5, nam ) def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): - pt_inputs_dict = {} for name, key in tf_inputs_dict.items(): if type(key) == bool: @@ -638,7 +636,6 @@ def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): return pt_inputs_dict def check_pt_tf_models(self, tf_model, pt_model, tf_inputs_dict): - pt_inputs_dict = self.prepare_pt_inputs_from_tf_inputs(tf_inputs_dict) # send pytorch inputs to the correct device @@ -670,7 +667,6 @@ def test_pt_tf_model_equivalence(self): import transformers for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # Output all for aggressive testing @@ -1069,7 +1065,6 @@ def test_determinism(self): self.assertLessEqual(max_diff, 1e-5) def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): diff --git a/tests/test_sequence_feature_extraction_common.py b/tests/test_sequence_feature_extraction_common.py index 0114f4016da695..710ad01250f9e9 100644 --- a/tests/test_sequence_feature_extraction_common.py +++ b/tests/test_sequence_feature_extraction_common.py @@ -23,7 +23,6 @@ class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin): - # to overwrite at feature extractactor specific tests feat_extract_tester = None feature_extraction_class = None diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 8ca460449e249c..83b857cda9ac03 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -34,6 +34,7 @@ from huggingface_hub.file_download import http_get from parameterized import parameterized from requests.exceptions import HTTPError + from transformers import ( AlbertTokenizer, AlbertTokenizerFast, @@ -131,7 +132,6 @@ def merge_model_tokenizer_mappings( class TokenizerTesterMixin: - tokenizer_class = None rust_tokenizer_class = None test_slow_tokenizer = True @@ -915,7 +915,6 @@ def test_encode_decode_with_spaces(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - new_toks = [ AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False), @@ -953,7 +952,6 @@ def test_mask_output(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if ( tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" and "token_type_ids" in 
tokenizer.model_input_names @@ -1004,7 +1002,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - seq_0 = "Test this method." seq_1 = "With these inputs." @@ -2140,7 +2137,6 @@ def test_pretokenized_inputs(self): tokenizers = self.get_tokenizers(do_lower_case=False) # , add_prefix_space=True) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if hasattr(tokenizer, "add_prefix_space") and not tokenizer.add_prefix_space: continue @@ -2373,7 +2369,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return @@ -2956,7 +2951,6 @@ def test_batch_encode_dynamic_overflowing(self): tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): - if is_torch_available(): returned_tensor = "pt" elif is_tf_available(): @@ -3579,7 +3573,6 @@ def test_compare_prepare_for_model(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py index a655b84dc16c68..186fabb7aea086 100644 --- a/tests/tokenization/test_tokenization_utils.py +++ b/tests/tokenization/test_tokenization_utils.py @@ -23,14 +23,12 @@ import numpy as np -# Ensure there are no circular imports when importing the parent class -from transformers import PreTrainedTokenizerFast - from transformers import ( BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, + PreTrainedTokenizerFast, TensorType, TokenSpan, is_tokenizers_available, diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 47f3ebbc442ecd..a09865448d789d 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -29,10 +29,10 @@ from unittest.mock import Mock, patch import numpy as np - from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token from parameterized import parameterized from requests.exceptions import HTTPError + from transformers import ( AutoTokenizer, IntervalStrategy, @@ -565,7 +565,6 @@ def test_adafactor_lr_none(self): @require_torch_gpu @require_torch_bf16_gpu def test_mixed_bf16(self): - # very basic test trainer = get_regression_trainer(learning_rate=0.1, bf16=True) trainer.train() @@ -580,7 +579,6 @@ def test_mixed_bf16(self): @require_torch_gpu @require_torch_tf32 def test_tf32(self): - # very basic test trainer = get_regression_trainer(learning_rate=0.1, tf32=True) trainer.train() @@ -1289,7 +1287,6 @@ def test_resume_training_with_randomness(self): @require_accelerate @require_torch_non_multi_gpu def test_auto_batch_size_finder(self): - if torch.cuda.is_available(): torch.backends.cudnn.deterministic = True @@ -1736,7 +1733,6 @@ def check_mem_metrics(self, trainer, check_func): check_func("test_mem_gpu_alloc_delta", metrics) def test_mem_metrics(self): - # with mem metrics enabled trainer = get_regression_trainer(skip_memory_metrics=False) 
self.check_mem_metrics(trainer, self.assertIn) @@ -1747,7 +1743,6 @@ def test_mem_metrics(self): @require_torch_gpu def test_fp16_full_eval(self): - # this is a sensitive test so let's keep debugging printouts in place for quick diagnosis. # it's using pretty large safety margins, but small enough to detect broken functionality. debug = 0 @@ -2467,7 +2462,6 @@ class MyTrialShortNamer(TrialShortNamer): DEFAULTS = {"a": 0, "b": 0} def hp_space(trial): - return { "method": "random", "metric": {}, diff --git a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py index 68d07e0f60f331..5fa6edb1c88f1b 100644 --- a/tests/trainer/test_trainer_distributed.py +++ b/tests/trainer/test_trainer_distributed.py @@ -66,7 +66,6 @@ def forward(self, input_ids, labels=None): class TestTrainerDistributedNeuronCore(TestCasePlus): @require_torch_neuroncore def test_trainer(self): - distributed_args = f""" -m torch.distributed.launch --nproc_per_node=2 @@ -83,7 +82,6 @@ def test_trainer(self): class TestTrainerDistributed(TestCasePlus): @require_torch_multi_gpu def test_trainer(self): - distributed_args = f""" -m torch.distributed.launch --nproc_per_node={torch.cuda.device_count()} diff --git a/tests/utils/test_hf_argparser.py b/tests/utils/test_hf_argparser.py index da824f4743823d..0ad3c9c2ac4672 100644 --- a/tests/utils/test_hf_argparser.py +++ b/tests/utils/test_hf_argparser.py @@ -24,6 +24,7 @@ from typing import List, Optional import yaml + from transformers import HfArgumentParser, TrainingArguments from transformers.hf_argparser import make_choice_type_function, string_to_bool diff --git a/tests/utils/test_hub_utils.py b/tests/utils/test_hub_utils.py index c8c7d0faad70c1..272ea26fa32e1a 100644 --- a/tests/utils/test_hub_utils.py +++ b/tests/utils/test_hub_utils.py @@ -19,6 +19,7 @@ from pathlib import Path from requests.exceptions import HTTPError + from transformers.utils import ( CONFIG_NAME, FLAX_WEIGHTS_NAME, diff --git a/tests/utils/test_logging.py b/tests/utils/test_logging.py index 81f3d9144ad78f..5f0b5fe32533e5 100644 --- a/tests/utils/test_logging.py +++ b/tests/utils/test_logging.py @@ -15,8 +15,9 @@ import os import unittest -import transformers.models.bart.tokenization_bart from huggingface_hub.utils import are_progress_bars_disabled + +import transformers.models.bart.tokenization_bart from transformers import logging from transformers.testing_utils import CaptureLogger, mockenv, mockenv_context from transformers.utils.logging import disable_progress_bar, enable_progress_bar diff --git a/tests/utils/test_modeling_tf_core.py b/tests/utils/test_modeling_tf_core.py index 6a68718aac94f3..77958335072353 100644 --- a/tests/utils/test_modeling_tf_core.py +++ b/tests/utils/test_modeling_tf_core.py @@ -62,7 +62,6 @@ @require_tf class TFCoreModelTesterMixin: - model_tester = None all_model_classes = () all_generative_model_classes = () diff --git a/tests/utils/test_offline.py b/tests/utils/test_offline.py index 708accc7a63885..1fda8804a88a88 100644 --- a/tests/utils/test_offline.py +++ b/tests/utils/test_offline.py @@ -22,7 +22,6 @@ class OfflineTests(TestCasePlus): @require_torch def test_offline_mode(self): - # this test is a bit tricky since TRANSFORMERS_OFFLINE can only be changed before # `transformers` is loaded, and it's too late for inside pytest - so we are changing it # while running an external program @@ -108,7 +107,6 @@ def offline_socket(*args, **kwargs): raise socket.error("Faking flaky internet") @require_torch def 
test_offline_mode_sharded_checkpoint(self): - # this test is a bit tricky since TRANSFORMERS_OFFLINE can only be changed before # `transformers` is loaded, and it's too late for inside pytest - so we are changing it # while running an external program diff --git a/tests/utils/test_skip_decorators.py b/tests/utils/test_skip_decorators.py index 89ff0e3bafdc2b..6888fea23cffd4 100644 --- a/tests/utils/test_skip_decorators.py +++ b/tests/utils/test_skip_decorators.py @@ -32,8 +32,8 @@ import unittest import pytest - from parameterized import parameterized + from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device diff --git a/tests/utils/test_versions_utils.py b/tests/utils/test_versions_utils.py index 6bd77218d69feb..e691a2bcc383e1 100644 --- a/tests/utils/test_versions_utils.py +++ b/tests/utils/test_versions_utils.py @@ -84,7 +84,6 @@ def test_core(self): self.assertIn("need one of ", str(e)) def test_python(self): - # matching requirement require_version("python>=3.6.0") diff --git a/utils/check_copies.py b/utils/check_copies.py index 48c1096f2b75d1..b10da732d32cf3 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -177,7 +177,7 @@ def blackify(code): has_indent = len(get_indent(code)) > 0 if has_indent: code = f"class Bla:\n{code}" - mode = black.Mode(target_versions={black.TargetVersion.PY35}, line_length=119, preview=True) + mode = black.Mode(target_versions={black.TargetVersion.PY37}, line_length=119) result = black.format_str(code, mode=mode) result, _ = style_docstrings_in_code(result) return result[len("class Bla:\n") :] if has_indent else result diff --git a/utils/check_doctest_list.py b/utils/check_doctest_list.py index 330832e04cb91a..c81c3d8e212ed4 100644 --- a/utils/check_doctest_list.py +++ b/utils/check_doctest_list.py @@ -22,7 +22,6 @@ if __name__ == "__main__": - doctest_file_path = os.path.join(REPO_PATH, "utils/documentation_tests.txt") non_existent_paths = [] with open(doctest_file_path) as fp: diff --git a/utils/check_repo.py b/utils/check_repo.py index 37fd498b64e5f5..cf921a7024a47f 100755 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -445,7 +445,7 @@ def get_model_test_files(): path = os.path.join(target_dir, file_or_dir) if os.path.isfile(path): filename = os.path.split(path)[-1] - if "test_modeling" in filename and not os.path.splitext(filename)[0] in _ignore_files: + if "test_modeling" in filename and os.path.splitext(filename)[0] not in _ignore_files: file = os.path.join(*path.split(os.sep)[1:]) test_files.append(file) diff --git a/utils/check_self_hosted_runner.py b/utils/check_self_hosted_runner.py index f7303366ea7840..0eadfcf5f9c0e1 100644 --- a/utils/check_self_hosted_runner.py +++ b/utils/check_self_hosted_runner.py @@ -4,7 +4,6 @@ def get_runner_status(target_runners, token): - offline_runners = [] cmd = ( diff --git a/utils/create_dummy_models.py b/utils/create_dummy_models.py index e2844f6aee12f3..47c150d6e84c68 100644 --- a/utils/create_dummy_models.py +++ b/utils/create_dummy_models.py @@ -25,10 +25,10 @@ import tempfile from pathlib import Path -from datasets import load_dataset - from check_config_docstrings import get_checkpoint_from_config_class +from datasets import load_dataset from huggingface_hub import Repository, create_repo, upload_folder + from transformers import ( CONFIG_MAPPING, FEATURE_EXTRACTOR_MAPPING, @@ -350,7 +350,6 @@ def get_tiny_config(config_class, **model_tester_kwargs): def convert_tokenizer(tokenizer_fast: PreTrainedTokenizerFast): - new_tokenizer = 
tokenizer_fast.train_new_from_iterator(training_ds["text"], TARGET_VOCAB_SIZE, show_progress=False) # Make sure it at least runs @@ -361,7 +360,6 @@ def convert_tokenizer(tokenizer_fast: PreTrainedTokenizerFast): def convert_feature_extractor(feature_extractor, tiny_config): - to_convert = False kwargs = {} if hasattr(tiny_config, "image_size"): @@ -574,7 +572,6 @@ def upload_model(model_dir, organization): raise ValueError(error) with tempfile.TemporaryDirectory() as tmpdir: - repo = Repository(local_dir=tmpdir, clone_from=f"{organization}/{repo_name}") repo.git_pull() shutil.copytree(model_dir, tmpdir, dirs_exist_ok=True) @@ -599,7 +596,6 @@ def upload_model(model_dir, organization): def build_composite_models(config_class, output_dir): - import tempfile from transformers import ( @@ -668,7 +664,6 @@ def build_composite_models(config_class, output_dir): tf_model_class = None with tempfile.TemporaryDirectory() as tmpdir: - try: # build encoder models_to_create = {"processor": encoder_processor, "pytorch": (encoder_class,), "tensorflow": []} @@ -761,7 +756,6 @@ def get_token_id_from_tokenizer(token_id_name, tokenizer, original_token_id): def get_config_overrides(config_class, processors): - config_overrides = {} # Check if there is any tokenizer (prefer fast version if any) @@ -990,7 +984,6 @@ def build(config_class, models_to_create, output_dir): def build_failed_report(results, include_warning=True): - failed_results = {} for config_name in results: if "error" in results[config_name]: @@ -1021,7 +1014,6 @@ def build_failed_report(results, include_warning=True): def build_simple_report(results): - text = "" failed_text = "" for config_name in results: @@ -1040,7 +1032,6 @@ def build_simple_report(results): if __name__ == "__main__": - clone_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) if os.getcwd() != clone_path: raise ValueError(f"This script should be run from the root of the clone of `transformers` {clone_path}") diff --git a/utils/custom_init_isort.py b/utils/custom_init_isort.py index c17ce139569f3e..d250ce7e64815a 100644 --- a/utils/custom_init_isort.py +++ b/utils/custom_init_isort.py @@ -96,6 +96,7 @@ def _inner(x): def sort_objects(objects, key=None): "Sort a list of `objects` following the rules of isort. `key` optionally maps an object to a str." + # If no key is provided, we use a noop. def noop(x): return x @@ -117,6 +118,7 @@ def sort_objects_in_import(import_statement): """ Return the same `import_statement` but with objects properly sorted. """ + # This inner function sort imports between [ ]. 
def _replace(match): imports = match.groups()[0] diff --git a/utils/extract_warnings.py b/utils/extract_warnings.py index 48912ea6f58f72..cb609e861503b8 100644 --- a/utils/extract_warnings.py +++ b/utils/extract_warnings.py @@ -5,6 +5,7 @@ import zipfile from get_ci_error_statistics import download_artifact, get_artifacts_links + from transformers import logging diff --git a/utils/get_ci_error_statistics.py b/utils/get_ci_error_statistics.py index 790ec5e3d565c3..b6642dce9c0d6f 100644 --- a/utils/get_ci_error_statistics.py +++ b/utils/get_ci_error_statistics.py @@ -209,7 +209,6 @@ def make_github_table_per_model(reduced_by_model): if __name__ == "__main__": - parser = argparse.ArgumentParser() # Required parameters parser.add_argument( diff --git a/utils/notification_service.py b/utils/notification_service.py index da315dc56aef3d..9967db447e772f 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -413,7 +413,6 @@ def payload(self) -> str: @staticmethod def error_out(title, ci_title="", runner_not_available=False, runner_failed=False, setup_failed=False): - blocks = [] title_block = {"type": "header", "text": {"type": "plain_text", "text": title}} blocks.append(title_block) @@ -691,7 +690,6 @@ def prepare_reports(title, header, reports, to_truncate=True): if __name__ == "__main__": - runner_status = os.environ.get("RUNNER_STATUS") runner_env_status = os.environ.get("RUNNER_ENV_STATUS") setup_status = os.environ.get("SETUP_STATUS") @@ -832,7 +830,6 @@ def prepare_reports(title, header, reports, to_truncate=True): for line in artifact["summary_short"].split("\n"): if re.search("FAILED", line): - line = line.replace("FAILED ", "") line = line.split()[0].replace("\n", "") @@ -897,7 +894,6 @@ def prepare_reports(title, header, reports, to_truncate=True): } for key in additional_results.keys(): - # If a whole suite of test fails, the artifact isn't available. if additional_files[key] not in available_artifacts: additional_results[key]["error"] = True diff --git a/utils/notification_service_doc_tests.py b/utils/notification_service_doc_tests.py index 7d5605c1cae3b0..3aabeaec0f3120 100644 --- a/utils/notification_service_doc_tests.py +++ b/utils/notification_service_doc_tests.py @@ -323,7 +323,6 @@ def add_path(self, path: str): if __name__ == "__main__": - github_actions_job_links = get_job_links() available_artifacts = retrieve_available_artifacts() @@ -359,7 +358,6 @@ def add_path(self, path: str): all_failures = extract_first_line_failure(artifact["failures_short"]) for line in artifact["summary_short"].split("\n"): if re.search("FAILED", line): - line = line.replace("FAILED ", "") line = line.split()[0].replace("\n", "") diff --git a/utils/update_metadata.py b/utils/update_metadata.py index d6dea03a8ba6e3..e52d93fe62d925 100644 --- a/utils/update_metadata.py +++ b/utils/update_metadata.py @@ -22,7 +22,6 @@ import pandas as pd from datasets import Dataset - from huggingface_hub import Repository