Commit 410a712 (parent 67d0b0c)

Specify UTF-8 encoding for every open function (#1283)

Authored by Andrei-Aksionov and williamFalcon on Apr 15, 2024
Co-authored-by: William Falcon <[email protected]>
Showing 14 changed files with 22 additions and 22 deletions.
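Why the change matters: in text mode, Python's open() falls back to locale.getpreferredencoding(False) when no encoding is given. That default is UTF-8 on most Linux and macOS systems but often cp1252 on Windows, so checkpoint metadata written on one platform can be misread on another, either as mojibake or as a UnicodeDecodeError. A minimal sketch of the failure mode (illustration only, not part of the diff; the file name is made up):

import locale

# What open() uses in text mode when encoding= is omitted:
# "UTF-8" on most Linux/macOS, frequently "cp1252" on Windows.
print(locale.getpreferredencoding(False))

# Portable write: the explicit encoding removes the platform dependence.
with open("hparams.yaml", "w", encoding="utf-8") as f:
    f.write("run_name: naïve-baseline\n")

# Reading back with a bare open("hparams.yaml") decodes with the locale
# default; on a cp1252 system the two UTF-8 bytes of "ï" surface as "Ã¯".
with open("hparams.yaml", encoding="utf-8") as f:
    print(f.read())

The diffs below apply the same one-line fix to every call site.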
litgpt/data/tinystories.py (2 changes: 1 addition & 1 deletion)

@@ -106,7 +106,7 @@ def val_dataloader(self) -> DataLoader:
 
 
 def tokenize(filename: str, tokenizer: Tokenizer):
-    with open(filename, "r") as f:
+    with open(filename, "r", encoding="utf-8") as f:
         data = json.load(f)
     global_rank = int(os.environ["DATA_OPTIMIZER_GLOBAL_RANK"])
     num_workers = int(os.environ["DATA_OPTIMIZER_NUM_WORKERS"])
litgpt/eval/evaluate.py (2 changes: 1 addition & 1 deletion)

@@ -92,7 +92,7 @@ def convert_and_evaluate(
     save_filepath = out_dir / Path("results.json") if save_filepath is None else Path(save_filepath)
     config_filepath = checkpoint_dir/"model_config.yaml"
 
-    with open(config_filepath) as f:
+    with open(config_filepath, encoding="utf-8") as f:
         config_dict = yaml.safe_load(f)
     repo_id = f"{config_dict['hf_config']['org']}/{config_dict['hf_config']['name']}"
 
litgpt/prompts.py (4 changes: 2 additions & 2 deletions)

@@ -340,12 +340,12 @@ def save_prompt_style(style: Union[str, PromptStyle], checkpoint_dir: Path) -> N
     cls = type(style)
     # Allow saving the full module path for user-defined prompt classes
     config = {"class_path": f"{cls.__module__}.{cls.__name__}"}
-    with open(checkpoint_dir / "prompt_style.yaml", "w") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "w", encoding="utf-8") as file:
         yaml.dump(config, file)
 
 
 def load_prompt_style(checkpoint_dir: Path) -> PromptStyle:
-    with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
         config = yaml.safe_load(file)
     # Support loading the full module path for user-defined prompt classes
     full_module_path, cls_name = config["class_path"].rsplit(".", 1)
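An aside on the YAML writes above (illustration, not from the commit): PyYAML escapes non-ASCII characters on dump unless allow_unicode=True, so a freshly written prompt_style.yaml is ASCII-safe either way; the explicit encoding chiefly protects reads of files a user has edited by hand.

import yaml

print(yaml.dump({"system": "Résumé"}), end="")
# system: "R\xE9sum\xE9"   (ASCII escapes, safe under any file encoding)
print(yaml.dump({"system": "Résumé"}, allow_unicode=True), end="")
# system: Résumé           (raw UTF-8, needs a matching encoding on read)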
litgpt/scripts/convert_hf_checkpoint.py (2 changes: 1 addition & 1 deletion)

@@ -329,7 +329,7 @@ def convert_hf_checkpoint(
     # Load the json file containing weight mapping
     pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json"
     if pytorch_bin_map_json_path.is_file():  # not all checkpoints have this file
-        with open(pytorch_bin_map_json_path) as json_map:
+        with open(pytorch_bin_map_json_path, encoding="utf-8") as json_map:
             bin_index = json.load(json_map)
         bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
     else:
litgpt/scripts/merge_lora.py (2 changes: 1 addition & 1 deletion)

@@ -72,7 +72,7 @@ def load_lora_metadata(checkpoint_dir: Path) -> Tuple[Dict[str, Any], Path, Opti
             f" the `litgpt/finetune/lora.py` script."
         )
 
-    with open(hparams_file, "r") as file:
+    with open(hparams_file, "r", encoding="utf-8") as file:
         hparams = yaml.safe_load(file)
 
     lora_params = {k: v for k, v in hparams.items() if k.startswith("lora_")}
litgpt/tokenizer.py (6 changes: 3 additions & 3 deletions)

@@ -33,14 +33,14 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None:
             self.backend = "huggingface"
 
             if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file():
-                with open(special_tokens_path) as fp:
+                with open(special_tokens_path, encoding="utf-8") as fp:
                     config = json.load(fp)
                 bos_token = config.get("bos_token")
                 self.bos_id = self.token_to_id(bos_token) if bos_token is not None else None
                 eos_token = config.get("eos_token")
                 self.eos_id = self.token_to_id(eos_token) if eos_token is not None else None
             if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file():
-                with open(special_tokens_path) as fp:
+                with open(special_tokens_path, encoding="utf-8") as fp:
                     config = json.load(fp)
                 if self.bos_id is None:
                     self.bos_id = config.get("bos_token_id")

@@ -71,7 +71,7 @@ def token_to_id(self, token: str) -> int:
     def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
         if not (tokenizer_config_path := checkpoint_dir / "tokenizer_config.json").is_file():
            return False
-        with open(tokenizer_config_path) as fp:
+        with open(tokenizer_config_path, encoding="utf-8") as fp:
             config = json.load(fp)
         if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
             return True
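For JSON reads like the ones above, an alternative worth noting (a sketch, not what the commit does): the standard json module detects UTF-8, UTF-16, and UTF-32 on its own when handed bytes, so reading in binary mode sidesteps the locale default entirely.

import json
from pathlib import Path

# Binary read: no text-mode decoding step; json.loads sniffs the encoding
# from the leading bytes and assumes UTF-8 otherwise.
config = json.loads(Path("tokenizer_config.json").read_bytes())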
litgpt/utils.py (2 changes: 1 addition & 1 deletion)

@@ -446,7 +446,7 @@ def save_hyperparameters(function: callable, checkpoint_dir: Path) -> None:
 
 def save_config(config: "Config", checkpoint_dir: Path) -> None:
     config_dict = asdict(config)
-    with open(checkpoint_dir / "model_config.yaml", "w") as fp:
+    with open(checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp:
         yaml.dump(config_dict, fp)
 
 
tests/data/test_tinystories.py (2 changes: 1 addition & 1 deletion)

@@ -43,7 +43,7 @@ def test_tokenize(tmp_path, monkeypatch):
     story1, story2 = "foo bar", " fun "
     data = [{"story": story1}, {"story": story2}]
     shard_path = tmp_path / "data.json"
-    with open(shard_path, "w") as f:
+    with open(shard_path, "w", encoding="utf-8") as f:
         json.dump(data, f)
 
     class Tokenizer:
tests/test_config.py (4 changes: 2 additions & 2 deletions)

@@ -60,7 +60,7 @@ def test_from_checkpoint(tmp_path):
 
     # 3. If only `lit_config.py` exists.
     config_data = {"name": "pythia-14m", "block_size": 24, "n_layer": 2}
-    with open(tmp_path / "model_config.yaml", "w") as file:
+    with open(tmp_path / "model_config.yaml", "w", encoding="utf-8") as file:
         yaml.dump(config_data, file)
     config = Config.from_checkpoint(tmp_path)
     assert config.name == "pythia-14m"

@@ -69,7 +69,7 @@ def test_from_checkpoint(tmp_path):
 
     # 4. Both `lit_config.py` and a matching config exist, but `lit_config.py` supersedes matching config
     (tmp_path / "pythia-14m").mkdir()
-    with open(tmp_path / "pythia-14m/model_config.yaml", "w") as file:
+    with open(tmp_path / "pythia-14m/model_config.yaml", "w", encoding="utf-8") as file:
         yaml.dump(config_data, file)
     config = Config.from_checkpoint(tmp_path / "pythia-14m")
     assert config.name == "pythia-14m"
tests/test_convert_lit_checkpoint.py (2 changes: 1 addition & 1 deletion)

@@ -35,7 +35,7 @@ def test_convert_lit_checkpoint(tmp_path):
     checkpoint_path = tmp_path / "lit_model.pth"
     config_path = tmp_path / "model_config.yaml"
     torch.save(ours_model.state_dict(), checkpoint_path)
-    with open(config_path, "w") as fp:
+    with open(config_path, "w", encoding="utf-8") as fp:
         yaml.dump(asdict(ours_config), fp)
     output_dir = tmp_path / "out_dir"
 
tests/test_evaluate.py (2 changes: 1 addition & 1 deletion)

@@ -35,7 +35,7 @@ def test_evaluate_script(tmp_path, monkeypatch):
     checkpoint_path = tmp_path / "lit_model.pth"
     torch.save(ours_model.state_dict(), checkpoint_path)
     config_path = tmp_path / "model_config.yaml"
-    with open(config_path, "w") as fp:
+    with open(config_path, "w", encoding="utf-8") as fp:
         yaml.dump(asdict(ours_config), fp)
 
     fn_kwargs = dict(
tests/test_merge_lora.py (6 changes: 3 additions & 3 deletions)

@@ -31,7 +31,7 @@ def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype)
 
     # Create a fake pretrained checkpoint
     config = dict(block_size=128, padded_vocab_size=256, n_layer=3, n_head=8, n_embd=16)
-    with open(pretrained_checkpoint_dir / "model_config.yaml", "w") as fp:
+    with open(pretrained_checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp:
         yaml.dump(config, fp)
     base_model = GPT.from_name("pythia-14m", **config).to(dtype=pretrained_dtype)
     state_dict = base_model.state_dict()

@@ -45,7 +45,7 @@ def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype)
     assert len(state_dict) == 6
     torch.save(state_dict, lora_checkpoint_dir / "lit_model.pth.lora")
     hparams = dict(checkpoint_dir=str(pretrained_checkpoint_dir), **lora_kwargs)
-    with open(lora_checkpoint_dir / "hyperparameters.yaml", "w") as file:
+    with open(lora_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file:
         yaml.dump(hparams, file)
     shutil.copyfile(pretrained_checkpoint_dir / "model_config.yaml", lora_checkpoint_dir / "model_config.yaml")
 

@@ -80,7 +80,7 @@ def test_load_lora_metadata(fake_checkpoint_dir):
         load_lora_metadata(fake_checkpoint_dir)
 
     hparams = dict(precision="bf16-mixed", checkpoint_dir="checkpoints/meta-llama/Llama-2-7b", lora_r=8, lora_alpha=16)
-    with open(fake_checkpoint_dir / "hyperparameters.yaml", "w") as file:
+    with open(fake_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file:
         yaml.dump(hparams, file)
 
     lora_args, pretrained_dir, precision = load_lora_metadata(fake_checkpoint_dir)
tests/test_prompts.py (4 changes: 2 additions & 2 deletions)

@@ -98,7 +98,7 @@ def test_save_load_prompt_style(tmp_path):
     assert not has_prompt_style(checkpoint_dir)
     save_prompt_style("alpaca", checkpoint_dir)
     assert has_prompt_style(checkpoint_dir)
-    with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
         contents = yaml.safe_load(file)
     assert contents == {"class_path": "litgpt.prompts.Alpaca"}
     loaded = load_prompt_style(checkpoint_dir)

@@ -108,7 +108,7 @@ def test_save_load_prompt_style(tmp_path):
     checkpoint_dir = tmp_path / "custom"
     checkpoint_dir.mkdir()
     save_prompt_style(CustomPromptStyle(), checkpoint_dir)
-    with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
         contents = yaml.safe_load(file)
     assert contents == {"class_path": "test_prompts.CustomPromptStyle"}
     loaded = load_prompt_style(checkpoint_dir)
tests/test_utils.py (4 changes: 2 additions & 2 deletions)

@@ -250,7 +250,7 @@ def test_save_hyperparameters(tmp_path):
     with mock.patch("sys.argv", ["any.py", "--out_dir", str(tmp_path), "--foo", "True"]):
         CLI(_test_function)
 
-    with open(tmp_path / "hyperparameters.yaml", "r") as file:
+    with open(tmp_path / "hyperparameters.yaml", "r", encoding="utf-8") as file:
         hparams = yaml.full_load(file)
 
     assert hparams["out_dir"] == str(tmp_path)

@@ -277,7 +277,7 @@ def test_save_hyperparameters_known_commands(command, tmp_path):
     with mock.patch("sys.argv", [*command.split(" "), "--out_dir", str(tmp_path), "--foo", "True"]):
         save_hyperparameters(_test_function2, tmp_path)
 
-    with open(tmp_path / "hyperparameters.yaml", "r") as file:
+    with open(tmp_path / "hyperparameters.yaml", "r", encoding="utf-8") as file:
         hparams = yaml.full_load(file)
 
     assert hparams["out_dir"] == str(tmp_path)
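A way to keep new call sites honest after this commit (sketch, assumes Python 3.10+ and PEP 597; not part of the diff): running the interpreter with -X warn_default_encoding makes every open() that omits encoding= emit an EncodingWarning, which a warnings filter can escalate into a hard failure.

# Run as:  python -X warn_default_encoding check_open.py
# (or set PYTHONWARNDEFAULTENCODING=1; the script name is made up)
import warnings

warnings.simplefilter("error", EncodingWarning)  # escalate to an exception

with open("model_config.yaml", "w", encoding="utf-8") as fp:  # explicit: quiet
    fp.write("name: pythia-14m\n")

open("model_config.yaml").close()  # EncodingWarning -> error under the flag

The same flag works for a whole test suite, e.g. python -X warn_default_encoding -m pytest tests/.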
