Avoid invalid escape sequences, use raw strings (huggingface#22936)
* Avoid invalid escape sequences, use raw strings

* Integrate PR feedback
Lingepumpe authored Apr 25, 2023
1 parent 81c1910 commit 5427250
Showing 24 changed files with 61 additions and 61 deletions.
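Background for the change, as a minimal sketch (illustrative, not part of the diff): in a regular Python string literal, a backslash followed by a character that is not a recognized escape, such as "\d", is an "invalid escape sequence". CPython keeps the backslash literally but emits a DeprecationWarning at compile time (a SyntaxWarning on newer interpreters), and this is exactly what ruff's W605 rule flags; the pyproject.toml change below stops ignoring that rule. Raw strings spell the backslash explicitly, so behavior is unchanged while the warning goes away:

    import re

    # All three spellings denote the same two-character string: a backslash and "d".
    # The first form is the deprecated one this commit removes.
    assert "\d" == r"\d" == "\\d"

    # The raw-string form is the idiomatic way to write regex patterns:
    assert re.match(r"layer\.\d+\.", "layer.3.attention") is not None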
@@ -41,8 +41,8 @@ def add_arguments(parser):
group.add_argument("--quant-disable", action="store_true", help="disable all quantizers")
group.add_argument("--quant-disable-embeddings", action="store_true", help="disable all embeddings quantizers")
group.add_argument("--quant-disable-keyword", type=str, nargs="+", help="disable quantizers by keyword")
group.add_argument("--quant-disable-layer-module", type=str, help="disable quantizers by keyword under layer.\d+.")
group.add_argument("--quant-enable-layer-module", type=str, help="enable quantizers by keyword under layer.\d+.")
group.add_argument("--quant-disable-layer-module", type=str, help="disable quantizers by keyword under layer.")
group.add_argument("--quant-enable-layer-module", type=str, help="enable quantizers by keyword under layer")
group.add_argument("--calibrator", default="max", help="which quantization range calibrator to use")
group.add_argument("--percentile", default=None, type=float, help="percentile for PercentileCalibrator")
group.add_argument("--fuse-qkv", action="store_true", help="use the same scale factor for qkv")
@@ -94,10 +94,10 @@ def configure_model(model, args, calib=False, eval=False):
set_quantizer_by_name(model, args.quant_disable_keyword, _disabled=True)

if args.quant_disable_layer_module:
set_quantizer_by_name(model, ["layer.\d+." + args.quant_disable_layer_module], _disabled=True)
set_quantizer_by_name(model, [r"layer.\d+." + args.quant_disable_layer_module], _disabled=True)

if args.quant_enable_layer_module:
set_quantizer_by_name(model, ["layer.\d+." + args.quant_enable_layer_module], _disabled=False)
set_quantizer_by_name(model, [r"layer.\d+." + args.quant_enable_layer_module], _disabled=False)

if args.recalibrate_weights:
recalibrate_weights(model)
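A quick sketch of how the pattern built above behaves (the keyword value is hypothetical): the r-prefix applies only to the literal it precedes, so concatenating a plain user-supplied keyword works as usual. Note that the dots in r"layer.\d+." are unescaped and therefore match any character, not just a literal dot:

    import re

    keyword = "matmul"                 # hypothetical --quant-disable-layer-module value
    pattern = r"layer.\d+." + keyword
    assert re.search(pattern, "bert.encoder.layer.11.matmul_q_input_quantizer") is not None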
2 changes: 1 addition & 1 deletion examples/research_projects/wav2vec2/run_asr.py
@@ -365,7 +365,7 @@ def main():
target_sr = processor.feature_extractor.sampling_rate if data_args.target_feature_extractor_sampling_rate else None
vocabulary_chars_str = "".join(t for t in processor.tokenizer.get_vocab().keys() if len(t) == 1)
vocabulary_text_cleaner = re.compile( # remove characters not in vocabulary
f"[^\s{re.escape(vocabulary_chars_str)}]", # allow space in addition to chars in vocabulary
rf"[^\s{re.escape(vocabulary_chars_str)}]", # allow space in addition to chars in vocabulary
flags=re.IGNORECASE if processor.tokenizer.do_lower_case else 0,
)
text_updates = []
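The rf prefix above combines f-string interpolation with raw-string escaping, and re.escape() neutralizes any regex metacharacters in the vocabulary. A minimal sketch with a made-up vocabulary:

    import re

    vocabulary_chars_str = "abc|"      # hypothetical one-character vocabulary entries
    cleaner = re.compile(rf"[^\s{re.escape(vocabulary_chars_str)}]")
    assert cleaner.sub("", "abc|d e?") == "abc| "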
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ target-version = ['py37']

[tool.ruff]
# Never enforce `E501` (line length violations).
ignore = ["C901", "E501", "E741", "W605"]
ignore = ["C901", "E501", "E741"]
select = ["C", "E", "F", "I", "W"]
line-length = 119

18 changes: 9 additions & 9 deletions src/transformers/commands/add_new_model_like.py
@@ -127,7 +127,7 @@ def find_indent(line: str) -> int:
"""
Returns the number of spaces that start a line indent.
"""
search = re.search("^(\s*)(?:\S|$)", line)
search = re.search(r"^(\s*)(?:\S|$)", line)
if search is None:
return 0
return len(search.groups()[0])
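For illustration, a self-contained copy of the fixed helper and how it behaves (a sketch, not part of the diff):

    import re

    def find_indent(line: str) -> int:
        # Capture the leading whitespace; (?:\S|$) anchors at the first
        # non-space character or at the end of an all-whitespace line.
        search = re.search(r"^(\s*)(?:\S|$)", line)
        if search is None:
            return 0
        return len(search.groups()[0])

    assert find_indent("    return x") == 4
    assert find_indent("no indent") == 0
    assert find_indent("") == 0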
@@ -519,7 +519,7 @@ def duplicate_module(
with open(module_file, "r", encoding="utf-8") as f:
content = f.read()

content = re.sub("# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content)
content = re.sub(r"# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content)
objects = parse_module_content(content)

# Loop and treat all objects
@@ -568,7 +568,7 @@ def duplicate_module(
# Regular classes functions
old_obj = obj
obj, replacement = replace_model_patterns(obj, old_model_patterns, new_model_patterns)
has_copied_from = re.search("^#\s+Copied from", obj, flags=re.MULTILINE) is not None
has_copied_from = re.search(r"^#\s+Copied from", obj, flags=re.MULTILINE) is not None
if add_copied_from and not has_copied_from and _re_class_func.search(obj) is not None and len(replacement) > 0:
# Copied from statement must be added just before the class/function definition, which may not be the
# first line because of decorators.
@@ -667,7 +667,7 @@ def get_model_files(model_type: str, frameworks: Optional[List[str]] = None) ->
return {"doc_file": doc_file, "model_files": model_files, "module_name": module_name, "test_files": test_files}


_re_checkpoint_for_doc = re.compile("^_CHECKPOINT_FOR_DOC\s+=\s+(\S*)\s*$", flags=re.MULTILINE)
_re_checkpoint_for_doc = re.compile(r"^_CHECKPOINT_FOR_DOC\s+=\s+(\S*)\s*$", flags=re.MULTILINE)


def find_base_model_checkpoint(
@@ -913,8 +913,8 @@ def clean_frameworks_in_init(
idx += 1
# Otherwise we keep the line, except if it's a tokenizer import and we don't want to keep it.
elif keep_processing or (
re.search('^\s*"(tokenization|processing|feature_extraction|image_processing)', lines[idx]) is None
and re.search("^\s*from .(tokenization|processing|feature_extraction|image_processing)", lines[idx])
re.search(r'^\s*"(tokenization|processing|feature_extraction|image_processing)', lines[idx]) is None
and re.search(r"^\s*from .(tokenization|processing|feature_extraction|image_processing)", lines[idx])
is None
):
new_lines.append(lines[idx])
@@ -1192,7 +1192,7 @@ def duplicate_doc_file(
with open(doc_file, "r", encoding="utf-8") as f:
content = f.read()

content = re.sub("<!--\s*Copyright (\d+)\s", f"<!--Copyright {CURRENT_YEAR} ", content)
content = re.sub(r"<!--\s*Copyright (\d+)\s", f"<!--Copyright {CURRENT_YEAR} ", content)
if frameworks is None:
frameworks = get_default_frameworks()
if dest_file is None:
@@ -1218,7 +1218,7 @@ def duplicate_doc_file(
if not block.startswith("#"):
new_blocks.append(block)
# Main title
elif re.search("^#\s+\S+", block) is not None:
elif re.search(r"^#\s+\S+", block) is not None:
new_blocks.append(f"# {new_model_patterns.model_name}\n")
# The config starts the part of the doc with the classes.
elif not in_classes and old_model_patterns.config_class in block.split("\n")[0]:
@@ -1230,7 +1230,7 @@ def duplicate_doc_file(
elif in_classes:
in_classes = True
block_title = block.split("\n")[0]
block_class = re.search("^#+\s+(\S.*)$", block_title).groups()[0]
block_class = re.search(r"^#+\s+(\S.*)$", block_title).groups()[0]
new_block, _ = replace_model_patterns(block, old_model_patterns, new_model_patterns)

if "Tokenizer" in block_class:
2 changes: 1 addition & 1 deletion src/transformers/modeling_utils.py
@@ -1829,7 +1829,7 @@ def save_pretrained(

# make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005
filename_no_suffix = filename.replace(".bin", "").replace(".safetensors", "")
reg = re.compile("(.*?)-\d{5}-of-\d{5}")
reg = re.compile(r"(.*?)-\d{5}-of-\d{5}")

if (
filename.startswith(weights_no_suffix)
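A sketch of what the shard-name pattern above accepts, using the example from the comment:

    import re

    reg = re.compile(r"(.*?)-\d{5}-of-\d{5}")
    m = reg.match("pytorch_model-00001-of-00005")
    assert m is not None and m.group(1) == "pytorch_model"
    assert reg.match("pytorch_model") is None    # non-sharded names don't match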
@@ -71,7 +71,7 @@ def layer_name_mapping(key, file):
def get_dtype_size(dtype):
if dtype == torch.bool:
return 1 / 8
bit_search = re.search("[^\d](\d+)$", str(dtype))
bit_search = re.search(r"[^\d](\d+)$", str(dtype))
if bit_search is None:
raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
bit_size = int(bit_search.groups()[0])
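The bit-width extraction above relies on str() of a torch dtype ending in its bit count. A torch-free sketch of the regex itself:

    import re

    # str(torch.float16) == "torch.float16"; the trailing digits are the bit width.
    bit_search = re.search(r"[^\d](\d+)$", "torch.float16")
    assert bit_search is not None and int(bit_search.groups()[0]) == 16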
@@ -350,7 +350,7 @@ def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_fo
)
parser.add_argument(
"--cvt_file_name",
default="cvtmodels\CvT-w24-384x384-IN-22k.pth",
default=r"cvtmodels\CvT-w24-384x384-IN-22k.pth",
type=str,
help="Input Image Size",
)
@@ -1823,7 +1823,7 @@ def forward(
)
class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel):
# When using clones, all layers > 0 will be clones, but layer 0 *is* required
_keys_to_ignore_on_load_missing = ["bbox_embed\.[1-9]\d*", "class_embed\.[1-9]\d*"]
_keys_to_ignore_on_load_missing = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]

def __init__(self, config: DeformableDetrConfig):
super().__init__(config)
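As the comment above notes, layer 0 is required while layers 1 and up are clones; [1-9]\d* matches any index from 1 upward but never 0. A sketch with hypothetical parameter names:

    import re

    pat = re.compile(r"bbox_embed\.[1-9]\d*")
    assert pat.search("bbox_embed.3.layers.0.weight") is not None   # clone: ignorable
    assert pat.search("bbox_embed.0.layers.0.weight") is None       # layer 0: required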
2 changes: 1 addition & 1 deletion src/transformers/models/deta/modeling_deta.py
@@ -1775,7 +1775,7 @@ def forward(
)
class DetaForObjectDetection(DetaPreTrainedModel):
# When using clones, all layers > 0 will be clones, but layer 0 *is* required
_keys_to_ignore_on_load_missing = ["bbox_embed\.[1-9]\d*", "class_embed\.[1-9]\d*"]
_keys_to_ignore_on_load_missing = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"]

# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection.__init__ with DeformableDetr->Deta
def __init__(self, config: DetaConfig):
@@ -50,12 +50,12 @@ def rename_key(old_name, num_meta4D_last_stage):
else:
new_name = old_name.replace("4", "batchnorm_after")

if "network" in old_name and re.search("\d\.\d", old_name):
if "network" in old_name and re.search(r"\d\.\d", old_name):
two_digit_num = r"\b\d{2}\b"
if bool(re.search(two_digit_num, old_name)):
match = re.search("\d\.\d\d.", old_name).group()
match = re.search(r"\d\.\d\d.", old_name).group()
else:
match = re.search("\d\.\d.", old_name).group()
match = re.search(r"\d\.\d.", old_name).group()
if int(match[0]) < 6:
trimmed_name = old_name.replace(match, "")
trimmed_name = trimmed_name.replace("network", match[0] + ".meta4D_layers.blocks." + match[2:-1])
@@ -78,7 +78,7 @@ def rename_key(old_name, num_meta4D_last_stage):

new_name = "last_stage." + trimmed_name

elif "network" in old_name and re.search(".\d.", old_name):
elif "network" in old_name and re.search(r".\d.", old_name):
new_name = old_name.replace("network", "intermediate_stages")

if "fc" in new_name:
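A sketch of the digit-pattern matching above against a hypothetical EfficientFormer checkpoint key:

    import re

    old_name = "network.1.3.mlp.fc1.weight"       # hypothetical checkpoint key
    assert "network" in old_name and re.search(r"\d\.\d", old_name) is not None
    assert not re.search(r"\b\d{2}\b", old_name)  # no two-digit block index here
    assert re.search(r"\d\.\d.", old_name).group() == "1.3."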
2 changes: 1 addition & 1 deletion src/transformers/models/glpn/modeling_glpn.py
@@ -632,7 +632,7 @@ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]:


class SiLogLoss(nn.Module):
"""
r"""
Implements the Scale-invariant log scale loss [Eigen et al., 2014](https://arxiv.org/abs/1406.2283).
$$L=\frac{1}{n} \sum_{i} d_{i}^{2}-\frac{1}{2 n^{2}}\left(\sum_{i} d_{i}\right)^{2}$$ where $d_{i}=\log y_{i}-\log y_{i}^{*}$.
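The r""" prefix above matters for docstrings that contain LaTeX: \f in \frac is a *valid* escape (form feed), so a plain docstring would silently corrupt the formula. A sketch:

    plain = "\frac{1}{n}"        # \f is parsed as a form-feed character
    raw = r"\frac{1}{n}"         # backslash kept verbatim
    assert plain != raw
    assert raw == "\\frac{1}{n}"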
18 changes: 9 additions & 9 deletions src/transformers/models/jukebox/convert_jukebox.py
@@ -97,23 +97,23 @@ def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping):
new_dict = {}
import re

re_encoder_block_conv_in = re.compile("encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)")
re_encoder_block_conv_in = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)")
re_encoder_block_resnet = re.compile(
"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
)
re_encoder_block_proj_out = re.compile("encoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)")
re_encoder_block_proj_out = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)")

re_decoder_block_conv_out = re.compile("decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)")
re_decoder_block_conv_out = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)")
re_decoder_block_resnet = re.compile(
"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
)
re_decoder_block_proj_in = re.compile("decoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)")
re_decoder_block_proj_in = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)")

re_prior_cond_conv_out = re.compile("conditioner_blocks.(\d*).cond.model.(\d*).(\d).(bias|weight)")
re_prior_cond_conv_out = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).(bias|weight)")
re_prior_cond_resnet = re.compile(
"conditioner_blocks.(\d*).cond.model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)"
)
re_prior_cond_proj_in = re.compile("conditioner_blocks.(\d*).cond.model.(\d*).(bias|weight)")
re_prior_cond_proj_in = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(bias|weight)")

for original_key, value in state_dict.items():
# rename vqvae.encoder keys
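A sketch of one of the remapping patterns above against a made-up checkpoint key; the unescaped dots match any character, which here happens to include the literal dots:

    import re

    re_encoder_block_conv_in = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)")
    m = re_encoder_block_conv_in.match("encoders.0.level_blocks.1.model.2.3.weight")
    assert m is not None and m.groups() == ("0", "1", "2", "3", "weight")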
8 changes: 4 additions & 4 deletions src/transformers/models/jukebox/tokenization_jukebox.py
@@ -148,10 +148,10 @@ def __init__(
with open(lyrics_file, encoding="utf-8") as vocab_handle:
self.lyrics_encoder = json.load(vocab_handle)

oov = "[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+"
oov = r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+"
# In v2 we had n_vocab=80; in v3 the + was missed, so n_vocab=79 characters.
if len(self.lyrics_encoder) == 79:
oov = oov.replace("\-'", "\-+'")
oov = oov.replace(r"\-'", r"\-+'")

self.out_of_vocab = regex.compile(oov)
self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
@@ -230,7 +230,7 @@ def prepare_for_tokenization(
] # split is for the full dictionary with combined genres

if self.version[0] == "v2":
self.out_of_vocab = regex.compile("[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n"
self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
self.vocab["<unk>"] = 0
@@ -239,7 +239,7 @@ def prepare_for_tokenization(
self.lyrics_decoder = {v: k for k, v in self.vocab.items()}
self.lyrics_decoder[0] = ""
else:
self.out_of_vocab = regex.compile("[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")
self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")

lyrics = self._run_strip_accents(lyrics)
lyrics = lyrics.replace("\\", "\n")
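Worth noting (a sketch, not from the diff): these rewrites are behavior-preserving, because an invalid escape already fell back to a literal backslash; the raw strings only make that explicit. The v2 tweak above, for example, produces the same pattern either way:

    oov = r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+"
    assert r"\-'" in oov                 # the raw spelling matches the raw literal
    assert oov.replace(r"\-'", r"\-+'") == r"[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+"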
4 changes: 2 additions & 2 deletions src/transformers/models/maskformer/modeling_maskformer.py
@@ -1241,7 +1241,7 @@ def forward(self, features: List[Tensor]) -> List[Tensor]:

class MaskFormerPixelDecoder(nn.Module):
def __init__(self, *args, feature_size: int = 256, mask_feature_size: int = 256, **kwargs):
"""
r"""
Pixel Decoder Module proposed in [Per-Pixel Classification is Not All You Need for Semantic
Segmentation](https://arxiv.org/abs/2107.06278). It first runs the backbone's features into a Feature Pyramid
Network creating a list of feature maps. Then, it projects the last one to the correct `mask_size`.
@@ -1250,7 +1250,7 @@ def __init__(self, *args, feature_size: int = 256, mask_feature_size: int = 256,
feature_size (`int`, *optional*, defaults to 256):
The feature size (channel dimension) of the FPN feature maps.
mask_feature_size (`int`, *optional*, defaults to 256):
-The features (channels) of the target masks size \\C_{\epsilon}\\ in the paper.
+The features (channels) of the target masks size \\(C_{\epsilon}\\) in the paper.
"""
super().__init__()

@@ -150,7 +150,7 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
transformer = lm["transformer"] if "transformer" in lm.keys() else lm["encoder"]

# The regex to extract layer names.
layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")

# The simple map of names for "automated" rules.
megatron_to_transformers = {
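A sketch of the layer-name pattern above against a hypothetical Megatron parameter name; the three groups split it into layer index, operation path, and weight/bias:

    import re

    layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
    m = layer_re.match("layers.3.attention.query_key_value.weight")
    assert m is not None and m.groups() == ("3", "attention.query_key_value", "weight")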
@@ -394,7 +394,7 @@ def convert_checkpoint_from_megatron_to_transformers(args):
pp_size = megatron_args.pipeline_model_parallel_size
dtype = torch.float32
# The regex to extract layer names.
layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")

# Convert.
print("Converting")
@@ -746,7 +746,7 @@ def convert_checkpoint_from_transformers_to_megatron(args):
)
num_layers = config.num_hidden_layers // args.target_pipeline_model_parallel_size

layer_re = re.compile("transformer.h\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
layer_re = re.compile(r"transformer.h\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
# The number of heads.
heads = config.n_head
# The hidden_size per head.
@@ -148,7 +148,7 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
transformer = lm["transformer"] if "transformer" in lm.keys() else lm["encoder"]

# The regex to extract layer names.
layer_re = re.compile("layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")

# The simple map of names for "automated" rules.
megatron_to_transformers = {
12 changes: 6 additions & 6 deletions tests/sagemaker/conftest.py
@@ -32,15 +32,15 @@ class SageMakerTestEnvironment:
def metric_definitions(self) -> str:
if self.framework == "pytorch":
return [
{"Name": "train_runtime", "Regex": "train_runtime.*=\D*(.*?)$"},
{"Name": "eval_accuracy", "Regex": "eval_accuracy.*=\D*(.*?)$"},
{"Name": "eval_loss", "Regex": "eval_loss.*=\D*(.*?)$"},
{"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
{"Name": "eval_accuracy", "Regex": r"eval_accuracy.*=\D*(.*?)$"},
{"Name": "eval_loss", "Regex": r"eval_loss.*=\D*(.*?)$"},
]
else:
return [
{"Name": "train_runtime", "Regex": "train_runtime.*=\D*(.*?)$"},
{"Name": "eval_accuracy", "Regex": "loss.*=\D*(.*?)]?$"},
{"Name": "eval_loss", "Regex": "sparse_categorical_accuracy.*=\D*(.*?)]?$"},
{"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
{"Name": "eval_accuracy", "Regex": r"loss.*=\D*(.*?)]?$"},
{"Name": "eval_loss", "Regex": r"sparse_categorical_accuracy.*=\D*(.*?)]?$"},
]

@property
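These regexes are what SageMaker applies to training logs to scrape metric values. A sketch against a hypothetical log line:

    import re

    m = re.search(r"train_runtime.*=\D*(.*?)$", "train_runtime = 123.4567")
    assert m is not None and m.group(1) == "123.4567"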
8 changes: 4 additions & 4 deletions tests/utils/test_add_new_model_like.py
@@ -157,14 +157,14 @@ def test_add_content_to_text(self):
self.assertEqual(
add_content_to_text(test_text, line, add_before=' "bert": "BertConfig",', exact_match=True), expected
)
-self.assertEqual(add_content_to_text(test_text, line, add_before=re.compile('^\s*"bert":')), expected)
+self.assertEqual(add_content_to_text(test_text, line, add_before=re.compile(r'^\s*"bert":')), expected)

self.assertEqual(add_content_to_text(test_text, line, add_after="gpt"), expected)
self.assertEqual(add_content_to_text(test_text, line, add_after="gpt", exact_match=True), test_text)
self.assertEqual(
add_content_to_text(test_text, line, add_after=' "gpt": "GPTConfig",', exact_match=True), expected
)
-self.assertEqual(add_content_to_text(test_text, line, add_after=re.compile('^\s*"gpt":')), expected)
+self.assertEqual(add_content_to_text(test_text, line, add_after=re.compile(r'^\s*"gpt":')), expected)

def test_add_content_to_file(self):
test_text = """all_configs = {
@@ -197,7 +197,7 @@ def test_add_content_to_file(self):
self.check_result(file_name, expected)

self.init_file(file_name, test_text)
-add_content_to_file(file_name, line, add_before=re.compile('^\s*"bert":'))
+add_content_to_file(file_name, line, add_before=re.compile(r'^\s*"bert":'))
self.check_result(file_name, expected)

self.init_file(file_name, test_text)
@@ -213,7 +213,7 @@ def test_add_content_to_file(self):
self.check_result(file_name, expected)

self.init_file(file_name, test_text)
-add_content_to_file(file_name, line, add_after=re.compile('^\s*"gpt":'))
+add_content_to_file(file_name, line, add_after=re.compile(r'^\s*"gpt":'))
self.check_result(file_name, expected)

def test_simplify_replacements(self):