From 62ca3e30aa7da4f7b88a4a878ee5a72d1ab97520 Mon Sep 17 00:00:00 2001
From: Andrei-Aksionov <aksionau.andrei@gmail.com>
Date: Fri, 19 Apr 2024 18:10:32 +0300
Subject: [PATCH] add_prefix_space shouldn't trigger use_bos

---
 litgpt/tokenizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/litgpt/tokenizer.py b/litgpt/tokenizer.py
index 55c972e69a..8217fcd069 100644
--- a/litgpt/tokenizer.py
+++ b/litgpt/tokenizer.py
@@ -73,11 +73,11 @@ def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
             return False
         with open(tokenizer_config_path, encoding="utf-8") as fp:
             config = json.load(fp)
-        if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
-            return True
-        # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True.
+        if "add_bos_token" in config:
+            return config["add_bos_token"]
+        # if `add_bos_token` isn't in the config file, but LLaMA tokenizer is used - return True.
         # ex: https://huggingface.co/stabilityai/StableBeluga2/blob/main/tokenizer_config.json#L2
-        return config.get("add_bos_token") is None and config.get("tokenizer_class") == "LlamaTokenizer"
+        return config.get("tokenizer_class") == "LlamaTokenizer"
 
     def encode(
         self,