From 62ca3e30aa7da4f7b88a4a878ee5a72d1ab97520 Mon Sep 17 00:00:00 2001 From: Andrei-Aksionov Date: Fri, 19 Apr 2024 18:10:32 +0300 Subject: [PATCH] add_prefix_space shouldn't trigger use_bos --- litgpt/tokenizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/litgpt/tokenizer.py b/litgpt/tokenizer.py index 55c972e69a..8217fcd069 100644 --- a/litgpt/tokenizer.py +++ b/litgpt/tokenizer.py @@ -73,11 +73,11 @@ def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool: return False with open(tokenizer_config_path, encoding="utf-8") as fp: config = json.load(fp) - if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")): - return True - # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True. + if "add_bos_token" in config: + return config["add_bos_token"] + # if `add_bos_token` isn't in the config file, but LLaMA tokenizer is used - return True. # ex: https://huggingface.co/stabilityai/StableBeluga2/blob/main/tokenizer_config.json#L2 - return config.get("add_bos_token") is None and config.get("tokenizer_class") == "LlamaTokenizer" + return config.get("tokenizer_class") == "LlamaTokenizer" def encode( self,