From bb5e8b86f60ba5d64112a4ef13b4b67ab7903e1c Mon Sep 17 00:00:00 2001 From: yzqzss Date: Fri, 26 Jul 2024 18:20:01 +0800 Subject: [PATCH] skip 404 content --- src/googlrot/main.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/googlrot/main.py b/src/googlrot/main.py index e0c211a..ebc1173 100644 --- a/src/googlrot/main.py +++ b/src/googlrot/main.py @@ -5,6 +5,7 @@ import os from github import Github, Auth +from github.GithubException import UnknownObjectException as GithubUnknownObjectException from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from pymongo.server_api import ServerApi @@ -101,7 +102,14 @@ async def code_mode(g: Github, googl_perfix_queue_collection: AsyncIOMotorCollec for result in g.search_code(f"goo.gl/{prefix} AND NOT is:fork"): logger.info(f"Processing {result.repository.full_name} ==") - content = result.decoded_content.decode("utf-8") + try: + content = result.decoded_content.decode("utf-8") + except GithubUnknownObjectException as e: + if e.message and "404" in e.message: + logger.error(f"404: {e}, skip") + continue + else: + raise e print("conetnt: ", content[:256]) for url in extractor.gen_urls(content): assert isinstance(url, str)