Commit 410a712 (parent 67d0b0c)

Specify UTF-8 encoding for every open function (#1283)

Authored by Andrei-Aksionov and williamFalcon on Apr 15, 2024
Co-authored-by: William Falcon <[email protected]>
Showing 14 changed files with 22 additions and 22 deletions.
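Why the change matters: in text mode, Python's open() falls back to locale.getpreferredencoding(False) when no encoding is given. That default is UTF-8 on most Linux and macOS systems but often cp1252 on Windows, so checkpoint metadata written on one platform can be misread on another, either as mojibake or as a UnicodeDecodeError. A minimal sketch of the failure mode (illustration only, not part of the diff; the file name is made up):

import locale

# What open() uses in text mode when encoding= is omitted:
# "UTF-8" on most Linux/macOS, frequently "cp1252" on Windows.
print(locale.getpreferredencoding(False))

# Portable write: the explicit encoding removes the platform dependence.
with open("hparams.yaml", "w", encoding="utf-8") as f:
    f.write("run_name: naïve-baseline\n")

# Reading back with a bare open("hparams.yaml") decodes with the locale
# default; on a cp1252 system the two UTF-8 bytes of "ï" surface as "Ã¯".
with open("hparams.yaml", encoding="utf-8") as f:
    print(f.read())

The diffs below apply the same one-line fix to every call site.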
litgpt/data/tinystories.py (2 changes: 1 addition & 1 deletion)

@@ -106,7 +106,7 @@ def val_dataloader(self) -> DataLoader:
 
 
 def tokenize(filename: str, tokenizer: Tokenizer):
-    with open(filename, "r") as f:
+    with open(filename, "r", encoding="utf-8") as f:
         data = json.load(f)
     global_rank = int(os.environ["DATA_OPTIMIZER_GLOBAL_RANK"])
     num_workers = int(os.environ["DATA_OPTIMIZER_NUM_WORKERS"])
litgpt/eval/evaluate.py (2 changes: 1 addition & 1 deletion)

@@ -92,7 +92,7 @@ def convert_and_evaluate(
     save_filepath = out_dir / Path("results.json") if save_filepath is None else Path(save_filepath)
     config_filepath = checkpoint_dir/"model_config.yaml"
 
-    with open(config_filepath) as f:
+    with open(config_filepath, encoding="utf-8") as f:
         config_dict = yaml.safe_load(f)
     repo_id = f"{config_dict['hf_config']['org']}/{config_dict['hf_config']['name']}"
 
litgpt/prompts.py (4 changes: 2 additions & 2 deletions)

@@ -340,12 +340,12 @@ def save_prompt_style(style: Union[str, PromptStyle], checkpoint_dir: Path) -> N
     cls = type(style)
     # Allow saving the full module path for user-defined prompt classes
     config = {"class_path": f"{cls.__module__}.{cls.__name__}"}
-    with open(checkpoint_dir / "prompt_style.yaml", "w") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "w", encoding="utf-8") as file:
         yaml.dump(config, file)
 
 
 def load_prompt_style(checkpoint_dir: Path) -> PromptStyle:
-    with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
         config = yaml.safe_load(file)
     # Support loading the full module path for user-defined prompt classes
     full_module_path, cls_name = config["class_path"].rsplit(".", 1)
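An aside on the YAML writes above (illustration, not from the commit): PyYAML escapes non-ASCII characters on dump unless allow_unicode=True, so a freshly written prompt_style.yaml is ASCII-safe either way; the explicit encoding chiefly protects reads of files a user has edited by hand.

import yaml

print(yaml.dump({"system": "Résumé"}), end="")
# system: "R\xE9sum\xE9"   (ASCII escapes, safe under any file encoding)
print(yaml.dump({"system": "Résumé"}, allow_unicode=True), end="")
# system: Résumé           (raw UTF-8, needs a matching encoding on read)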
litgpt/scripts/convert_hf_checkpoint.py (2 changes: 1 addition & 1 deletion)

@@ -329,7 +329,7 @@ def convert_hf_checkpoint(
     # Load the json file containing weight mapping
     pytorch_bin_map_json_path = checkpoint_dir / "pytorch_model.bin.index.json"
     if pytorch_bin_map_json_path.is_file():  # not all checkpoints have this file
-        with open(pytorch_bin_map_json_path) as json_map:
+        with open(pytorch_bin_map_json_path, encoding="utf-8") as json_map:
             bin_index = json.load(json_map)
         bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
     else:
litgpt/scripts/merge_lora.py (2 changes: 1 addition & 1 deletion)

@@ -72,7 +72,7 @@ def load_lora_metadata(checkpoint_dir: Path) -> Tuple[Dict[str, Any], Path, Opti
             f" the `litgpt/finetune/lora.py` script."
         )
 
-    with open(hparams_file, "r") as file:
+    with open(hparams_file, "r", encoding="utf-8") as file:
         hparams = yaml.safe_load(file)
 
     lora_params = {k: v for k, v in hparams.items() if k.startswith("lora_")}
litgpt/tokenizer.py (6 changes: 3 additions & 3 deletions)

@@ -33,14 +33,14 @@ def __init__(self, checkpoint_dir: Union[Path, str]) -> None:
             self.backend = "huggingface"
 
             if (special_tokens_path := checkpoint_dir / "tokenizer_config.json").is_file():
-                with open(special_tokens_path) as fp:
+                with open(special_tokens_path, encoding="utf-8") as fp:
                     config = json.load(fp)
                 bos_token = config.get("bos_token")
                 self.bos_id = self.token_to_id(bos_token) if bos_token is not None else None
                 eos_token = config.get("eos_token")
                 self.eos_id = self.token_to_id(eos_token) if eos_token is not None else None
             if (special_tokens_path := checkpoint_dir / "generation_config.json").is_file():
-                with open(special_tokens_path) as fp:
+                with open(special_tokens_path, encoding="utf-8") as fp:
                     config = json.load(fp)
                 if self.bos_id is None:
                     self.bos_id = config.get("bos_token_id")

@@ -71,7 +71,7 @@ def token_to_id(self, token: str) -> int:
     def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
         if not (tokenizer_config_path := checkpoint_dir / "tokenizer_config.json").is_file():
            return False
-        with open(tokenizer_config_path) as fp:
+        with open(tokenizer_config_path, encoding="utf-8") as fp:
             config = json.load(fp)
         if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
             return True
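For JSON reads like the ones above, an alternative worth noting (a sketch, not what the commit does): the standard json module detects UTF-8, UTF-16, and UTF-32 on its own when handed bytes, so reading in binary mode sidesteps the locale default entirely.

import json
from pathlib import Path

# Binary read: no text-mode decoding step; json.loads sniffs the encoding
# from the leading bytes and assumes UTF-8 otherwise.
config = json.loads(Path("tokenizer_config.json").read_bytes())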
litgpt/utils.py (2 changes: 1 addition & 1 deletion)

@@ -446,7 +446,7 @@ def save_hyperparameters(function: callable, checkpoint_dir: Path) -> None:
 
 def save_config(config: "Config", checkpoint_dir: Path) -> None:
     config_dict = asdict(config)
-    with open(checkpoint_dir / "model_config.yaml", "w") as fp:
+    with open(checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp:
         yaml.dump(config_dict, fp)
 
 
tests/data/test_tinystories.py (2 changes: 1 addition & 1 deletion)

@@ -43,7 +43,7 @@ def test_tokenize(tmp_path, monkeypatch):
     story1, story2 = "foo bar", " fun "
     data = [{"story": story1}, {"story": story2}]
     shard_path = tmp_path / "data.json"
-    with open(shard_path, "w") as f:
+    with open(shard_path, "w", encoding="utf-8") as f:
         json.dump(data, f)
 
     class Tokenizer:
tests/test_config.py (4 changes: 2 additions & 2 deletions)

@@ -60,7 +60,7 @@ def test_from_checkpoint(tmp_path):
 
     # 3. If only `lit_config.py` exists.
     config_data = {"name": "pythia-14m", "block_size": 24, "n_layer": 2}
-    with open(tmp_path / "model_config.yaml", "w") as file:
+    with open(tmp_path / "model_config.yaml", "w", encoding="utf-8") as file:
         yaml.dump(config_data, file)
     config = Config.from_checkpoint(tmp_path)
     assert config.name == "pythia-14m"

@@ -69,7 +69,7 @@ def test_from_checkpoint(tmp_path):
 
     # 4. Both `lit_config.py` and a matching config exist, but `lit_config.py` supersedes matching config
     (tmp_path / "pythia-14m").mkdir()
-    with open(tmp_path / "pythia-14m/model_config.yaml", "w") as file:
+    with open(tmp_path / "pythia-14m/model_config.yaml", "w", encoding="utf-8") as file:
         yaml.dump(config_data, file)
     config = Config.from_checkpoint(tmp_path / "pythia-14m")
     assert config.name == "pythia-14m"
tests/test_convert_lit_checkpoint.py (2 changes: 1 addition & 1 deletion)

@@ -35,7 +35,7 @@ def test_convert_lit_checkpoint(tmp_path):
     checkpoint_path = tmp_path / "lit_model.pth"
     config_path = tmp_path / "model_config.yaml"
     torch.save(ours_model.state_dict(), checkpoint_path)
-    with open(config_path, "w") as fp:
+    with open(config_path, "w", encoding="utf-8") as fp:
         yaml.dump(asdict(ours_config), fp)
     output_dir = tmp_path / "out_dir"
 
tests/test_evaluate.py (2 changes: 1 addition & 1 deletion)

@@ -35,7 +35,7 @@ def test_evaluate_script(tmp_path, monkeypatch):
     checkpoint_path = tmp_path / "lit_model.pth"
     torch.save(ours_model.state_dict(), checkpoint_path)
     config_path = tmp_path / "model_config.yaml"
-    with open(config_path, "w") as fp:
+    with open(config_path, "w", encoding="utf-8") as fp:
         yaml.dump(asdict(ours_config), fp)
 
     fn_kwargs = dict(
tests/test_merge_lora.py (6 changes: 3 additions & 3 deletions)

@@ -31,7 +31,7 @@ def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype)
 
     # Create a fake pretrained checkpoint
     config = dict(block_size=128, padded_vocab_size=256, n_layer=3, n_head=8, n_embd=16)
-    with open(pretrained_checkpoint_dir / "model_config.yaml", "w") as fp:
+    with open(pretrained_checkpoint_dir / "model_config.yaml", "w", encoding="utf-8") as fp:
         yaml.dump(config, fp)
     base_model = GPT.from_name("pythia-14m", **config).to(dtype=pretrained_dtype)
     state_dict = base_model.state_dict()

@@ -45,7 +45,7 @@ def test_merge_lora(tmp_path, fake_checkpoint_dir, pretrained_dtype, lora_dtype)
     assert len(state_dict) == 6
     torch.save(state_dict, lora_checkpoint_dir / "lit_model.pth.lora")
     hparams = dict(checkpoint_dir=str(pretrained_checkpoint_dir), **lora_kwargs)
-    with open(lora_checkpoint_dir / "hyperparameters.yaml", "w") as file:
+    with open(lora_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file:
         yaml.dump(hparams, file)
     shutil.copyfile(pretrained_checkpoint_dir / "model_config.yaml", lora_checkpoint_dir / "model_config.yaml")
 

@@ -80,7 +80,7 @@ def test_load_lora_metadata(fake_checkpoint_dir):
         load_lora_metadata(fake_checkpoint_dir)
 
     hparams = dict(precision="bf16-mixed", checkpoint_dir="checkpoints/meta-llama/Llama-2-7b", lora_r=8, lora_alpha=16)
-    with open(fake_checkpoint_dir / "hyperparameters.yaml", "w") as file:
+    with open(fake_checkpoint_dir / "hyperparameters.yaml", "w", encoding="utf-8") as file:
         yaml.dump(hparams, file)
 
     lora_args, pretrained_dir, precision = load_lora_metadata(fake_checkpoint_dir)
tests/test_prompts.py (4 changes: 2 additions & 2 deletions)

@@ -98,7 +98,7 @@ def test_save_load_prompt_style(tmp_path):
     assert not has_prompt_style(checkpoint_dir)
     save_prompt_style("alpaca", checkpoint_dir)
     assert has_prompt_style(checkpoint_dir)
-    with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
         contents = yaml.safe_load(file)
     assert contents == {"class_path": "litgpt.prompts.Alpaca"}
     loaded = load_prompt_style(checkpoint_dir)

@@ -108,7 +108,7 @@ def test_save_load_prompt_style(tmp_path):
     checkpoint_dir = tmp_path / "custom"
     checkpoint_dir.mkdir()
     save_prompt_style(CustomPromptStyle(), checkpoint_dir)
-    with open(checkpoint_dir / "prompt_style.yaml", "r") as file:
+    with open(checkpoint_dir / "prompt_style.yaml", "r", encoding="utf-8") as file:
         contents = yaml.safe_load(file)
     assert contents == {"class_path": "test_prompts.CustomPromptStyle"}
     loaded = load_prompt_style(checkpoint_dir)
tests/test_utils.py (4 changes: 2 additions & 2 deletions)

@@ -250,7 +250,7 @@ def test_save_hyperparameters(tmp_path):
     with mock.patch("sys.argv", ["any.py", "--out_dir", str(tmp_path), "--foo", "True"]):
         CLI(_test_function)
 
-    with open(tmp_path / "hyperparameters.yaml", "r") as file:
+    with open(tmp_path / "hyperparameters.yaml", "r", encoding="utf-8") as file:
         hparams = yaml.full_load(file)
 
     assert hparams["out_dir"] == str(tmp_path)

@@ -277,7 +277,7 @@ def test_save_hyperparameters_known_commands(command, tmp_path):
     with mock.patch("sys.argv", [*command.split(" "), "--out_dir", str(tmp_path), "--foo", "True"]):
         save_hyperparameters(_test_function2, tmp_path)
 
-    with open(tmp_path / "hyperparameters.yaml", "r") as file:
+    with open(tmp_path / "hyperparameters.yaml", "r", encoding="utf-8") as file:
         hparams = yaml.full_load(file)
 
     assert hparams["out_dir"] == str(tmp_path)
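A way to keep new call sites honest after this commit (sketch, assumes Python 3.10+ and PEP 597; not part of the diff): running the interpreter with -X warn_default_encoding makes every open() that omits encoding= emit an EncodingWarning, which a warnings filter can escalate into a hard failure.

# Run as:  python -X warn_default_encoding check_open.py
# (or set PYTHONWARNDEFAULTENCODING=1; the script name is made up)
import warnings

warnings.simplefilter("error", EncodingWarning)  # escalate to an exception

with open("model_config.yaml", "w", encoding="utf-8") as fp:  # explicit: quiet
    fp.write("name: pythia-14m\n")

open("model_config.yaml").close()  # EncodingWarning -> error under the flag

The same flag works for a whole test suite, e.g. python -X warn_default_encoding -m pytest tests/.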
