Skip to content

Commit

Permalink
Tokenizer: add_prefix_space shouldn't affect self.use_bos (#1328)
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrei-Aksionov authored Apr 23, 2024
1 parent 5de6fdf commit a91b520
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions litgpt/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,11 @@ def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
             return False
         with open(tokenizer_config_path, encoding="utf-8") as fp:
             config = json.load(fp)
-        if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
-            return True
-        # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True.
+        if "add_bos_token" in config:
+            return config["add_bos_token"]
+        # if `add_bos_token` isn't in the config file, but LLaMA tokenizer is used - return True.
         # ex: https://huggingface.co/stabilityai/StableBeluga2/blob/main/tokenizer_config.json#L2
-        return config.get("add_bos_token") is None and config.get("tokenizer_class") == "LlamaTokenizer"
+        return config.get("tokenizer_class") == "LlamaTokenizer"
 
     def encode(
         self,
Expand Down

0 comments on commit a91b520

Please sign in to comment.