From a91b5206dfbfd5651410381a075a1fc982d8d28c Mon Sep 17 00:00:00 2001
From: Andrei-Aksionov <58434077+Andrei-Aksionov@users.noreply.github.com>
Date: Tue, 23 Apr 2024 19:03:08 +0300
Subject: [PATCH] Tokenizer: `add_prefix_space` shouldn't affect 
 `self.use_bos` (#1328)

---
 litgpt/tokenizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/litgpt/tokenizer.py b/litgpt/tokenizer.py
index 55c972e69a..8217fcd069 100644
--- a/litgpt/tokenizer.py
+++ b/litgpt/tokenizer.py
@@ -73,11 +73,11 @@ def check_if_bos_token_used(self, checkpoint_dir: Path) -> bool:
             return False
         with open(tokenizer_config_path, encoding="utf-8") as fp:
             config = json.load(fp)
-        if any(config.get(check, False) for check in ("add_bos_token", "add_prefix_space")):
-            return True
-        # for examples that also use the Llama tokenizer, but do not have or set add_bos_token to True.
+        if "add_bos_token" in config:
+            return config["add_bos_token"]
+        # if `add_bos_token` isn't in the config file, but LLaMA tokenizer is used - return True.
         # ex: https://huggingface.co/stabilityai/StableBeluga2/blob/main/tokenizer_config.json#L2
-        return config.get("add_bos_token") is None and config.get("tokenizer_class") == "LlamaTokenizer"
+        return config.get("tokenizer_class") == "LlamaTokenizer"
 
     def encode(
         self,