From 4e0f56a1ee4c9bd54e73cec99f98df5a96a3dabb Mon Sep 17 00:00:00 2001 From: Neil Mehta Date: Wed, 27 Nov 2024 11:48:29 -0500 Subject: [PATCH 1/3] Open tokenizer.json within context manager --- mlx_vlm/tokenizer_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlx_vlm/tokenizer_utils.py b/mlx_vlm/tokenizer_utils.py index 209096c..b3103cc 100644 --- a/mlx_vlm/tokenizer_utils.py +++ b/mlx_vlm/tokenizer_utils.py @@ -314,7 +314,8 @@ def load_tokenizer(model_path, return_tokenizer=True, tokenizer_config_extra={}) tokenizer_file = model_path / "tokenizer.json" if tokenizer_file.exists(): - tokenizer_content = json.load(tokenizer_file.open()) + with open(tokenizer_file, "r") as f: + tokenizer_content = json.load(f) if "decoder" in tokenizer_content: if _is_spm_decoder(tokenizer_content["decoder"]): detokenizer_class = SPMStreamingDetokenizer From cfcdb67ef426c93c3f6452178421d5b5fd305a1c Mon Sep 17 00:00:00 2001 From: Neil Mehta Date: Wed, 27 Nov 2024 15:38:33 -0500 Subject: [PATCH 2/3] Update mlx_vlm/tokenizer_utils.py Co-authored-by: Prince Canuma --- mlx_vlm/tokenizer_utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mlx_vlm/tokenizer_utils.py b/mlx_vlm/tokenizer_utils.py index b3103cc..4381282 100644 --- a/mlx_vlm/tokenizer_utils.py +++ b/mlx_vlm/tokenizer_utils.py @@ -315,7 +315,14 @@ def load_tokenizer(model_path, return_tokenizer=True, tokenizer_config_extra={}) tokenizer_file = model_path / "tokenizer.json" if tokenizer_file.exists(): with open(tokenizer_file, "r") as f: - tokenizer_content = json.load(f) + try: + tokenizer_content = json.load(f) + except JSONDecodeError as e: + raise JSONDecodeError( + "Failed to parse tokenizer.json", + e.doc, + e.pos + ) if "decoder" in tokenizer_content: if _is_spm_decoder(tokenizer_content["decoder"]): detokenizer_class = SPMStreamingDetokenizer From 3b62a23c32be30f1d1079425de7c95307a730a6a Mon Sep 17 00:00:00 2001 From: Neil Mehta Date: Wed, 27 Nov 2024 15:39:17 -0500 Subject: [PATCH 3/3] Add import statement and run pre-commit --- mlx_vlm/tokenizer_utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mlx_vlm/tokenizer_utils.py b/mlx_vlm/tokenizer_utils.py index 4381282..946d916 100644 --- a/mlx_vlm/tokenizer_utils.py +++ b/mlx_vlm/tokenizer_utils.py @@ -1,5 +1,6 @@ import json from functools import partial +from json import JSONDecodeError from transformers import AutoTokenizer @@ -318,11 +319,7 @@ def load_tokenizer(model_path, return_tokenizer=True, tokenizer_config_extra={}) try: tokenizer_content = json.load(f) except JSONDecodeError as e: - raise JSONDecodeError( - "Failed to parse tokenizer.json", - e.doc, - e.pos - ) + raise JSONDecodeError("Failed to parse tokenizer.json", e.doc, e.pos) if "decoder" in tokenizer_content: if _is_spm_decoder(tokenizer_content["decoder"]): detokenizer_class = SPMStreamingDetokenizer