From 9b0e3defd4d7c2a0ab618382e07815dd4b2e607f Mon Sep 17 00:00:00 2001 From: yzqzss Date: Fri, 26 Jul 2024 18:21:47 +0800 Subject: [PATCH] skip non-utf8 content --- src/googlrot/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/googlrot/main.py b/src/googlrot/main.py index ebc1173..d93193c 100644 --- a/src/googlrot/main.py +++ b/src/googlrot/main.py @@ -110,6 +110,9 @@ async def code_mode(g: Github, googl_perfix_queue_collection: AsyncIOMotorCollec continue else: raise e + except UnicodeDecodeError: + logger.error(f"UnicodeDecodeError: {result.repository.full_name}, skip") + continue print("conetnt: ", content[:256]) for url in extractor.gen_urls(content): assert isinstance(url, str)