# Matches either a UTF-16 surrogate pair written as two consecutive escapes
# (groups 1 and 2) or any single \uXXXX escape (group 3). The pair alternative
# is listed first so that it wins whenever both could match.
_UNICODE_ESCAPE_PATTERN = re.compile(
    r"\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})"
    r"|\\u([0-9a-fA-F]{4})"
)


def safe_unicode_decode(content):
    r"""Decode UTF-8 input and expand literal ``\uXXXX`` escape sequences.

    Unlike ``bytes.decode("unicode_escape")``, the input is first decoded as
    UTF-8, so characters that are already real (non-escaped) non-ASCII text
    are left intact; only literal ``\uXXXX`` sequences are rewritten.

    Args:
        content: UTF-8 encoded ``bytes``/``bytearray``. A ``str`` is also
            accepted and used as-is.

    Returns:
        The text with every ``\uXXXX`` escape replaced by its character.
        A high/low surrogate pair (``\ud83d\ude00``) is combined into the
        single code point it encodes. An unpaired surrogate escape is left
        in its escaped form, because a lone surrogate cannot be encoded to
        UTF-8 afterwards.

    Raises:
        UnicodeDecodeError: If ``content`` is bytes that are not valid UTF-8.
    """
    if isinstance(content, (bytes, bytearray)):
        text = content.decode("utf-8")
    else:
        text = content

    def replace_unicode_escape(match):
        high, low, single = match.groups()
        if single is None:
            # Surrogate pair: recombine the two 16-bit halves into one
            # supplementary-plane code point.
            return chr(
                0x10000
                + ((int(high, 16) - 0xD800) << 10)
                + (int(low, 16) - 0xDC00)
            )
        code_point = int(single, 16)
        if 0xD800 <= code_point <= 0xDFFF:
            # Unpaired surrogate: keep the escape text rather than emit a
            # character that would make the string un-encodable.
            return match.group(0)
        return chr(code_point)

    return _UNICODE_ESCAPE_PATTERN.sub(replace_unicode_escape, text)