From a91b5206dfbfd5651410381a075a1fc982d8d28c Mon Sep 17 00:00:00 2001
From: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com>
Date: Tue, 23 Apr 2024 19:03:08 +0300
Subject: [PATCH] Tokenizer: `add_prefix_space` shouldn't affect `self.use_bos` (#1328)

---
 litgpt/tokenizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/litgpt/tokenizer.py b/litgpt/tokenizer.py
index 55c972e69a..8217fcd069 100644
--- a/litgpt/tokenizer.py
+++ b/litgpt/tokenizer.py
@@ -73,11 +73,11 @@ def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
             return False
         with open(tokenizer_config_path, encoding="utf-8") as fp:
             config = json.load(fp)
-        if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
-            return True
-        # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True.
+        if "add_bos_token" in config:
+            return config["add_bos_token"]
+        # if `add_bos_token` isn't in the config file, but LLaMA tokenizer is used - return True.
         # ex: https://huggingface.co/stabilityai/StableBeluga2/blob/main/tokenizer_config.json#L2
-        return config.get("add_bos_token") is None and config.get("tokenizer_class") == "LlamaTokenizer"
+        return config.get("tokenizer_class") == "LlamaTokenizer"
 
     def encode(
         self,