diff --git a/setup.py b/setup.py
index ab2c4cc..f9437d3 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@
 setuptools.setup(
     name="wechatarticles",
-    version="0.5.7",
+    version="0.5.8",
     author="wnma3mz",
     author_email="wnma3mz@gmail.com",
     description="wechat articles scrapy",
@@ -14,9 +14,7 @@
     long_description_content_type="text/markdown",
     url="https://github.com/wnma3mz/wechat_articles_spider",
     packages=setuptools.find_packages(),
-    install_requires=[
-        'requests>=2.20.0', 'beautifulsoup4>=4.7.1'
-    ],
+    install_requires=["requests>=2.20.0", "beautifulsoup4>=4.7.1"],
     classifiers=(
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: Apache Software License",
diff --git a/wechatarticles/ArticlesAPI.py b/wechatarticles/ArticlesAPI.py
index 73e5e2a..0a9fcef 100644
--- a/wechatarticles/ArticlesAPI.py
+++ b/wechatarticles/ArticlesAPI.py
@@ -9,14 +9,16 @@ class ArticlesAPI(object):
     """
     Integrate ArticlesUrls and ArticlesInfo for convenient calling
     """
-    def __init__(self,
-                 username=None,
-                 password=None,
-                 official_cookie=None,
-                 token=None,
-                 appmsg_token=None,
-                 wechat_cookie=None,
-                 outfile=None):
+    def __init__(
+        self,
+        username=None,
+        password=None,
+        official_cookie=None,
+        token=None,
+        appmsg_token=None,
+        wechat_cookie=None,
+        outfile=None,
+    ):
         """
         Initialize the parameters
         Parameters
@@ -47,9 +49,9 @@ def __init__(self,
             raise SystemError("please check your params")
 
         # Two ways are supported: fetching the params automatically via mitmproxy, or passing them in manually
-        if (appmsg_token == None) and (wechat_cookie == None) and (outfile !=
-                                                                   None):
+        if (appmsg_token == None) and (wechat_cookie == None) and (outfile != None):
             from .ReadOutfile import Reader
+
             reader = Reader()
             reader.contral(outfile)
             self.appmsg_token, self.cookie = reader.request(outfile)
@@ -83,38 +85,38 @@ def complete_info(self, nickname, begin=0, count=5):
         'comments': comment info of the article
             {
                 "base_resp": {
-                    "errmsg": "ok", 
+                    "errmsg": "ok",
                     "ret": 0
-                }, 
+                },
                 "elected_comment": [
                     {
-                        "content": text of the user comment, 
-                        "content_id": "6846263421277569047", 
-                        "create_time": 1520098511, 
-                        "id": 3, 
-                        "is_from_friend": 0, 
-                        "is_from_me": 0, 
+                        "content": text of the user comment,
+                        "content_id": "6846263421277569047",
+                        "create_time": 1520098511,
+                        "id": 3,
+                        "is_from_friend": 0,
+                        "is_from_me": 0,
                         "is_top": 0, whether the comment is pinned
-                        "like_id": 10001, 
-                        "like_num": 3, 
-                        "like_status": 0, 
-                        "logo_url": "http://wx.qlogo.cn/mmhead/OibRNdtlJdkFLMHYLMR92Lvq0PicDpJpbnaicP3Z6kVcCicLPVjCWbAA9w/132", 
-                        "my_id": 23, 
-                        "nick_name": nickname of the commenter, 
+                        "like_id": 10001,
+                        "like_num": 3,
+                        "like_status": 0,
+                        "logo_url": "http://wx.qlogo.cn/mmhead/OibRNdtlJdkFLMHYLMR92Lvq0PicDpJpbnaicP3Z6kVcCicLPVjCWbAA9w/132",
+                        "my_id": 23,
+                        "nick_name": nickname of the commenter,
                         "reply": {
                             "reply_list": [ ]
                         }
                     }
-                ], 
+                ],
                 "elected_comment_total_cnt": 3, total number of comments
-                "enabled": 1, 
-                "friend_comment": [ ], 
-                "is_fans": 1, 
-                "logo_url": "http://wx.qlogo.cn/mmhead/Q3auHgzwzM6GAic0FAHOu9Gtv5lEu5kUqO6y6EjEFjAhuhUNIS7Y2AQ/132", 
-                "my_comment": [ ], 
-                "nick_name": current user's nickname, 
+                "enabled": 1,
+                "friend_comment": [ ],
+                "is_fans": 1,
+                "logo_url": "http://wx.qlogo.cn/mmhead/Q3auHgzwzM6GAic0FAHOu9Gtv5lEu5kUqO6y6EjEFjAhuhUNIS7Y2AQ/132",
+                "my_comment": [ ],
+                "nick_name": current user's nickname,
                 "only_fans_can_comment": false
-            }, 
+            },
         'cover': url of the cover image
         'digest': article digest,
         'itemidx': 1,
         'like_num': 18, number of likes of the article
@@ -128,7 +130,8 @@ def complete_info(self, nickname, begin=0, count=5):
         """
         # Fetch the article data
         artiacle_data = self.officical.articles(
-            nickname, begin=str(begin), count=str(count))
+            nickname, begin=str(begin), count=str(count)
+        )
 
         # Extract each article's url, fetch its like/read/comment info, and merge it into the original json
         for data in artiacle_data:
             article_url = data["link"]
             comments = self.wechat.comments(article_url)
             read_like_nums = self.wechat.read_like_nums(article_url)
             data["comments"] = comments
-            data["read_num"], data["like_num"], data['old_like_num'] = read_like_nums
+            data["read_num"], data["like_num"], data["old_like_num"] = read_like_nums
 
         return artiacle_data
@@ -147,7 +150,7 @@ def __extract_info(self, articles_data):
             comments = self.wechat.comments(article_url)
             read_like_nums = self.wechat.read_like_nums(article_url)
             data["comments"] = comments
-            data["read_num"], data["like_num"], data['old_like_num'] = read_like_nums
+            data["read_num"], data["like_num"], data["old_like_num"] = read_like_nums
 
         return articles_data
@@ -172,38 +175,38 @@ def continue_info(self, nickname, begin=0):
         'comments': comment info of the article
             {
                 "base_resp": {
-                    "errmsg": "ok", 
+                    "errmsg": "ok",
                     "ret": 0
-                }, 
+                },
                 "elected_comment": [
                     {
-                        "content": text of the user comment, 
-                        "content_id": "6846263421277569047", 
-                        "create_time": 1520098511, 
-                        "id": 3, 
-                        "is_from_friend": 0, 
-                        "is_from_me": 0, 
+                        "content": text of the user comment,
+                        "content_id": "6846263421277569047",
+                        "create_time": 1520098511,
+                        "id": 3,
+                        "is_from_friend": 0,
+                        "is_from_me": 0,
                         "is_top": 0, whether the comment is pinned
-                        "like_id": 10001, 
-                        "like_num": 3, 
-                        "like_status": 0, 
-                        "logo_url": "http://wx.qlogo.cn/mmhead/OibRNdtlJdkFLMHYLMR92Lvq0PicDpJpbnaicP3Z6kVcCicLPVjCWbAA9w/132", 
-                        "my_id": 23, 
-                        "nick_name": nickname of the commenter, 
+                        "like_id": 10001,
+                        "like_num": 3,
+                        "like_status": 0,
+                        "logo_url": "http://wx.qlogo.cn/mmhead/OibRNdtlJdkFLMHYLMR92Lvq0PicDpJpbnaicP3Z6kVcCicLPVjCWbAA9w/132",
+                        "my_id": 23,
+                        "nick_name": nickname of the commenter,
                         "reply": {
                             "reply_list": [ ]
                         }
                     }
-                ], 
+                ],
                 "elected_comment_total_cnt": 3, total number of comments
-                "enabled": 1, 
-                "friend_comment": [ ], 
-                "is_fans": 1, 
-                "logo_url": "http://wx.qlogo.cn/mmhead/Q3auHgzwzM6GAic0FAHOu9Gtv5lEu5kUqO6y6EjEFjAhuhUNIS7Y2AQ/132", 
-                "my_comment": [ ], 
-                "nick_name": current user's nickname, 
+                "enabled": 1,
+                "friend_comment": [ ],
+                "is_fans": 1,
+                "logo_url": "http://wx.qlogo.cn/mmhead/Q3auHgzwzM6GAic0FAHOu9Gtv5lEu5kUqO6y6EjEFjAhuhUNIS7Y2AQ/132",
+                "my_comment": [ ],
+                "nick_name": current user's nickname,
                 "only_fans_can_comment": false
-            }, 
+            },
         'cover': url of the cover image
         'digest': article digest,
         'itemidx': 1,
         'like_num': 18, number of likes of the article
@@ -222,7 +225,9 @@ def continue_info(self, nickname, begin=0):
                 # Fetch the article data
                 artiacle_datas.append(
                     self.officical.articles(
-                        nickname, begin=str(begin), count=str(count)))
+                        nickname, begin=str(begin), count=str(count)
+                    )
+                )
             except Exception as e:
                 print(e)
                 break
@@ -232,6 +237,7 @@ def continue_info(self, nickname, begin=0):
         def flatten(x):
             return [y for l in x for y in flatten(l)] if type(x) is list else [x]
+
         # flatten = lambda x: [y for l in x for y in flatten(l)] if type(x) is list else [x]
         print("Failed to crawl article {}; try again later or continue crawling with another account".format(begin))
         return self.__extract_info(flatten(artiacle_datas))
diff --git a/wechatarticles/ArticlesInfo.py b/wechatarticles/ArticlesInfo.py
index a394176..1cdca33 100755
--- a/wechatarticles/ArticlesInfo.py
+++ b/wechatarticles/ArticlesInfo.py
@@ -8,13 +8,8 @@ class ArticlesInfo(object):
     """
     Log in to WeChat to fetch more detailed article info, such as likes, reads, and comments
     """
-    def __init__(self,
-                 appmsg_token,
-                 cookie,
-                 proxies={
-                     'http': None,
-                     'https': None
-                 }):
+
+    def __init__(self, appmsg_token, cookie, proxies={"http": None, "https": None}):
         """
         Initialize the parameters
         Parameters
@@ -32,9 +27,8 @@ def __init__(self,
         self.s.trust_env = False
         self.appmsg_token = appmsg_token
         self.headers = {
-            "User-Agent":
-            "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
-            "Cookie": cookie
+            "User-Agent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
+            "Cookie": cookie,
         }
         self.data = {
             "is_only_read": "1",
@@ -58,8 +52,7 @@ def __verify_url(self, article_url):
         verify_lst = ["mp.weixin.qq.com", "__biz", "mid", "sn", "idx"]
         for string in verify_lst:
             if string not in article_url:
-                raise Exception(
-                    "params are wrong, please check your article_url")
+                raise Exception("params are wrong, please check your article_url")
 
     def read_like_nums(self, article_url):
         """
@@ -76,8 +69,11 @@
         """
         try:
             appmsgstat = self.__get_appmsgext(article_url)["appmsgstat"]
-            return appmsgstat["read_num"], appmsgstat["like_num"], appmsgstat[
-                "old_like_num"]
+            return (
+                appmsgstat["read_num"],
+                appmsgstat["like_num"],
+                appmsgstat["old_like_num"],
+            )
         except Exception:
             raise Exception("params are wrong, please check your article_url")
@@ -130,11 +126,10 @@ def comments(self, article_url):
         __biz, _, idx, _ = self.__get_params(article_url)
         getcomment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz={}&idx={}&comment_id={}&limit=100"
         try:
-            url = getcomment_url.format(__biz, idx,
-                                        self.__get_comment_id(article_url))
-            comment_json = self.s.get(url,
-                                      headers=self.headers,
-                                      proxies=self.proxies).json()
+            url = getcomment_url.format(__biz, idx, self.__get_comment_id(article_url))
+            comment_json = self.s.get(
+                url, headers=self.headers, proxies=self.proxies
+            ).json()
         except Exception as e:
             print(e)
             comment_json = {}
@@ -155,8 +150,7 @@ def __get_comment_id(self, article_url):
         """
         res = self.s.get(article_url, data=self.data, proxies=self.proxies)
         # Extract comment_id with a regex
-        comment_id = re.findall(r'comment_id = "\d+"',
-                                res.text)[0].split(" ")[-1][1:-1]
+        comment_id = re.findall(r'comment_id = "\d+"', res.text)[0].split(" ")[-1][1:-1]
         return comment_id
@@ -177,7 +171,7 @@ def __get_params(self, article_url):
         # Split the url and extract the corresponding parameters
         string_lst = article_url.split("?")[1].split("&")
-        dict_value = [string[string.index("=") + 1:] for string in string_lst]
+        dict_value = [string[string.index("=") + 1 :] for string in string_lst]
         __biz, mid, idx, sn, *_ = dict_value
         sn = sn[:-3] if sn[-3] == "#" else sn
@@ -213,8 +207,7 @@ def __get_appmsgext(self, article_url):
         # Move the params into the request data; this step seems to work either way
         origin_url = "https://mp.weixin.qq.com/mp/getappmsgext?"
-        appmsgext_url = origin_url + "appmsg_token={}&x5=0".format(
-            self.appmsg_token)
+        appmsgext_url = origin_url + "appmsg_token={}&x5=0".format(self.appmsg_token)
         self.data["__biz"] = __biz
         self.data["mid"] = mid
         self.data["sn"] = sn
@@ -222,12 +215,10 @@
         # appmsgext_url = origin_url + "__biz={}&mid={}&sn={}&idx={}&appmsg_token={}&x5=1".format(
         #     __biz, mid, sn, idx, self.appmsg_token)
-        appmsgext_json = requests.post(appmsgext_url,
-                                       headers=self.headers,
-                                       data=self.data,
-                                       proxies=self.proxies).json()
+        appmsgext_json = requests.post(
+            appmsgext_url, headers=self.headers, data=self.data, proxies=self.proxies
+        ).json()
         if "appmsgstat" not in appmsgext_json.keys():
-            raise Exception(
-                "failed to get info, please check your cookie and appmsg_token")
+            raise Exception("failed to get info, please check your cookie and appmsg_token")
 
         return appmsgext_json
diff --git a/wechatarticles/ArticlesUrls.py b/wechatarticles/ArticlesUrls.py
index 6baa1e0..ae1ded8 100755
--- a/wechatarticles/ArticlesUrls.py
+++ b/wechatarticles/ArticlesUrls.py
@@ -10,7 +10,8 @@ class ArticlesUrls(object):
     """
     Get the article links of the WeChat official account to crawl
     """
-    def __init__(self, cookie, token, proxies={'http': None, 'https': None}):
+
+    def __init__(self, cookie, token, proxies={"http": None, "https": None}):
         """
         Initialize the parameters
         Parameters
@@ -26,8 +27,7 @@ def __init__(self, cookie, token, proxies={'http': None, 'https': None}):
         """
         self.s = requests.session()
         self.headers = {
-            "User-Agent":
-            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"
         }
         self.params = {
             "lang": "zh_CN",
@@ -71,6 +71,7 @@ def __save_login_qrcode(self, img):
         """
         import matplotlib.pyplot as plt
         from PIL import Image
+
         # Save the QR code
         with open("login.png", "wb+") as fp:
             fp.write(img.content)
@@ -95,18 +96,18 @@ def __save_cookie(self, username):
         -------
         None
         """
-        #Instantiate an LWPCookieJar object
-        new_cookie_jar = cookielib.LWPCookieJar(username + '.txt')
+        # Instantiate an LWPCookieJar object
+        new_cookie_jar = cookielib.LWPCookieJar(username + ".txt")
 
-        #Convert the RequestsCookieJar to a dict (manually, via a dict comprehension) and store it in the LWPCookieJar
+        # Convert the RequestsCookieJar to a dict (manually, via a dict comprehension) and store it in the LWPCookieJar
         requests.utils.cookiejar_from_dict(
-            {c.name: c.value
-             for c in self.s.cookies}, new_cookie_jar)
+            {c.name: c.value for c in self.s.cookies}, new_cookie_jar
+        )
 
-        #Save to a local file
-        new_cookie_jar.save('cookies/' + username + '.txt',
-                            ignore_discard=True,
-                            ignore_expires=True)
+        # Save to a local file
+        new_cookie_jar.save(
+            "cookies/" + username + ".txt", ignore_discard=True, ignore_expires=True
+        )
@@ -120,15 +121,15 @@ def __read_cookie(self, username):
         -------
         None
         """
-        #Instantiate an LWPCookieJar object
+        # Instantiate an LWPCookieJar object
         load_cookiejar = cookielib.LWPCookieJar()
-        #Load cookies from the file (LWP format)
-        load_cookiejar.load('cookies/' + username + '.txt',
-                            ignore_discard=True,
-                            ignore_expires=True)
-        #Convert to a dict with the utility method
+        # Load cookies from the file (LWP format)
+        load_cookiejar.load(
+            "cookies/" + username + ".txt", ignore_discard=True, ignore_expires=True
+        )
+        # Convert to a dict with the utility method
         load_cookies = requests.utils.dict_from_cookiejar(load_cookiejar)
-        #Convert the dict back into a RequestsCookieJar and assign it to the session's cookies.
+        # Convert the dict back into a RequestsCookieJar and assign it to the session's cookies.
         self.s.cookies = requests.utils.cookiejar_from_dict(load_cookies)
 
     def __md5_passwd(self, password):
         """
@@ -145,7 +146,7 @@ def __md5_passwd(self, password):
         The encrypted string
         """
         m5 = hashlib.md5()
-        m5.update(password.encode('utf-8'))
+        m5.update(password.encode("utf-8"))
         pwd = m5.hexdigest()
         return pwd
@@ -172,7 +173,7 @@ def __startlogin_official(self, username, password):
             "lang": "zh_CN",
             "imgcode": "",
             "f": "json",
-            "ajax": "1"
+            "ajax": "1",
         }
 
         # Add keys to the headers
@@ -211,7 +212,8 @@ def __login_official(self, username, password):
         """
         # Set the Referer header for the request
         referer = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=validate&lang=zh_CN&account={}".format(
-            username)
+            username
+        )
         self.headers["Referer"] = referer
 
         # Data for fetching the token
@@ -224,10 +226,9 @@
         }
         # Url for fetching the token
         bizlogin_url = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=login"
-        res = self.s.post(bizlogin_url,
-                          data=data,
-                          headers=self.headers,
-                          proxies=self.proxies).json()
+        res = self.s.post(
+            bizlogin_url, data=data, headers=self.headers, proxies=self.proxies
+        ).json()
 
         try:
             # Slice the token parameter out of the string
@@ -277,16 +278,18 @@ def official_info(self, nickname, begin=0, count=5):
             "count": str(count),
             "action": "search_biz",
             "ajax": "1",
-            "begin": str(begin)
+            "begin": str(begin),
         }
         self.params.update(params)
 
         try:
             # Return the official account whose name is closest to the input
-            official = self.s.get(search_url,
-                                  headers=self.headers,
-                                  params=self.params,
-                                  proxies=self.proxies)
+            official = self.s.get(
+                search_url,
+                headers=self.headers,
+                params=self.params,
+                proxies=self.proxies,
+            )
             return official.json()["list"]
         except Exception:
             raise Exception(u"Wrong official account name, or wrong cookie/token; please re-enter")
@@ -341,9 +344,9 @@ def articles(self, nickname, begin=0, count=5):
         """
         self.__verify_str(nickname, "nickname")
         try:
-            return self.__get_articles_data(nickname,
-                                            begin=str(begin),
-                                            count=str(count))["app_msg_list"]
+            return self.__get_articles_data(
+                nickname, begin=str(begin), count=str(count)
+            )["app_msg_list"]
         except Exception:
             raise Exception(u"Wrong official account name, or wrong cookie/token; please re-enter")
@@ -373,19 +376,20 @@ def lastest_articles(self, biz):
         An empty list means there are no matching articles
         """
         try:
-            return self.__get_articles_data("", begin="0",
-                                            biz=biz)["app_msg_list"]
+            return self.__get_articles_data("", begin="0", biz=biz)["app_msg_list"]
         except Exception:
             raise Exception(u"Wrong official account name, or wrong cookie/token; please re-enter")
 
-    def __get_articles_data(self,
-                            nickname,
-                            begin,
-                            biz=None,
-                            count=5,
-                            type_="9",
-                            action="list_ex",
-                            query=None):
+    def __get_articles_data(
+        self,
+        nickname,
+        begin,
+        biz=None,
+        count=5,
+        type_="9",
+        action="list_ex",
+        query=None,
+    ):
         """
         Get some info about the official account's articles
         Parameters
@@ -436,12 +440,11 @@ def __get_articles_data(self,
             "begin": str(begin),
             "count": str(count),
             "type": str(type_),
-            "action": action
+            "action": action,
         }
         self.params.update(params)
 
-        data = self.s.get(appmsg_url,
-                          headers=self.headers,
-                          params=self.params,
-                          proxies=self.proxies)
+        data = self.s.get(
+            appmsg_url, headers=self.headers, params=self.params, proxies=self.proxies
+        )
 
         return data.json()
diff --git a/wechatarticles/GetUrls.py b/wechatarticles/GetUrls.py
index 16cbb27..66f95cb 100644
--- a/wechatarticles/GetUrls.py
+++ b/wechatarticles/GetUrls.py
@@ -6,14 +6,8 @@ class PCUrls(object):
     """
     Get the article links to crawl via the PC WeChat client
     """
-    def __init__(self,
-                 biz,
-                 uin,
-                 cookie,
-                 proxies={
-                     'http': None,
-                     'https': None
-                 }):
+
+    def __init__(self, biz, uin, cookie, proxies={"http": None, "https": None}):
         """
         Initialize the parameters
         Parameters
@@ -31,10 +25,10 @@ def __init__(self,
         self.s = requests.session()
         self.__biz = biz
         self.uin = uin
-        self.headers = {'Cookies': cookie}
+        self.headers = {"Cookies": cookie}
         self.proxies = proxies
 
-    def get_urls(self, key, offset='0'):
+    def get_urls(self, key, offset="0"):
         """
         Get the urls
         Parameters
@@ -84,29 +78,29 @@ def get_urls(self, key, offset='0'):
         ]
         """
         self.params = {
-            'action': 'getmsg',
-            '__biz': self.__biz,
-            'f': 'json',
-            'offset': str(offset),
-            'count': '10',
-            'uin': self.uin,
-            'key': key,
+            "action": "getmsg",
+            "__biz": self.__biz,
+            "f": "json",
+            "offset": str(offset),
+            "count": "10",
+            "uin": self.uin,
+            "key": key,
         }
-        origin_url = 'https://mp.weixin.qq.com/mp/profile_ext'
+        origin_url = "https://mp.weixin.qq.com/mp/profile_ext"
 
-        msg_json = self.s.get(origin_url,
-                              params=self.params,
-                              headers=self.headers,
-                              proxies=self.proxies).json()
-        if 'general_msg_list' in msg_json.keys():
+        msg_json = self.s.get(
+            origin_url, params=self.params, headers=self.headers, proxies=self.proxies
+        ).json()
+        if "general_msg_list" in msg_json.keys():
             lst = [
-                item for item in eval(msg_json['general_msg_list'])['list']
-                if 'app_msg_ext_info' in item.keys()
+                item
+                for item in eval(msg_json["general_msg_list"])["list"]
+                if "app_msg_ext_info" in item.keys()
             ]
             return lst
         raise Exception(
-            'Failure:\n1. params are wrong, please check your params\n2. the key has expired, please update your key'
+            "Failure:\n1. params are wrong, please check your params\n2. the key has expired, please update your key"
         )
@@ -114,6 +108,7 @@ class MobileUrls(object):
     """
     Get the article links to crawl via the mobile WeChat client
     """
+
     def __init__(self, biz, cookie):
         """
         Initialize the parameters
@@ -130,12 +125,11 @@ def __init__(self, biz, cookie):
         self.s = requests.session()
         self.__biz = biz
         self.headers = {
-            'User-Agent':
-            'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile',
-            'Cookie': cookie
+            "User-Agent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
+            "Cookie": cookie,
         }
 
-    def get_urls(self, appmsg_token, offset='0'):
+    def get_urls(self, appmsg_token, offset="0"):
         """
         Get the urls
         Parameters
@@ -185,26 +179,26 @@ def get_urls(self, appmsg_token, offset='0'):
         ]
         """
         self.params = {
-            'action': 'getmsg',
-            '__biz': self.__biz,
-            'f': 'json',
-            'offset': str(offset),
-            'count': '10',
-            'appmsg_token': appmsg_token,
+            "action": "getmsg",
+            "__biz": self.__biz,
+            "f": "json",
+            "offset": str(offset),
+            "count": "10",
+            "appmsg_token": appmsg_token,
         }
-        origin_url = 'https://mp.weixin.qq.com/mp/profile_ext'
+        origin_url = "https://mp.weixin.qq.com/mp/profile_ext"
 
-        msg_json = self.s.get(origin_url,
-                              params=self.params,
-                              headers=self.headers,
-                              proxies=self.proxies).json()
-        if 'general_msg_list' in msg_json.keys():
+        msg_json = self.s.get(
+            origin_url, params=self.params, headers=self.headers, proxies=self.proxies
+        ).json()
+        if "general_msg_list" in msg_json.keys():
             lst = [
-                item for item in eval(msg_json['general_msg_list'])['list']
-                if 'app_msg_ext_info' in item.keys()
+                item
+                for item in eval(msg_json["general_msg_list"])["list"]
+                if "app_msg_ext_info" in item.keys()
             ]
             return lst
         raise Exception(
-            'Failure:\n1. params are wrong, please check your params\n2. the key has expired, please update your key'
+            "Failure:\n1. params are wrong, please check your params\n2. the key has expired, please update your key"
         )
diff --git a/wechatarticles/Url2Html.py b/wechatarticles/Url2Html.py
index 847c925..79524ae 100644
--- a/wechatarticles/Url2Html.py
+++ b/wechatarticles/Url2Html.py
@@ -8,6 +8,7 @@
 class Url2Html(object):
     """Download a WeChat article as a local HTML file from its link"""
+
     def __init__(self, img_path=None):
         """
         img_path: local path for storing images, which are referenced via absolute paths. Downloading images is optional
         """
@@ -23,7 +24,7 @@ def replace_name(self, title):
         title: the article title
         """
         rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'
-        title = re.sub(rstr, "", title).replace('|', '').replace('\n', '')
+        title = re.sub(rstr, "", title).replace("|", "").replace("\n", "")
         return title
@@ -32,19 +33,17 @@ def download_img(self, url):
         url: the image link
         """
         # Derive the image name from the link
-        name = '{}.{}'.format(
-            url.split('/')[-2],
-            url.split('/')[3].split('_')[-1])
+        name = "{}.{}".format(url.split("/")[-2], url.split("/")[3].split("_")[-1])
         imgpath = os.path.join(self.img_path, name)
         # If the image has already been downloaded, skip the download and return its path directly
         if os.path.isfile(imgpath):
-            with open(imgpath, 'rb') as f:
+            with open(imgpath, "rb") as f:
                 img = f.read()
             return imgpath, img
         response = requests.get(url, proxies=self.proxies)
         img = response.content
-        with open(imgpath, 'wb') as f:
+        with open(imgpath, "wb") as f:
             f.write(img)
         imgpath = os.path.basename(self.img_path)
         return os.path.join(imgpath, name), img
@@ -61,13 +60,12 @@ def replace_img(self, html):
         img_url_lst = data_croporisrc_lst + data_src_lst + src_lst
         img_lst = []
         for img_url in img_url_lst:
-            if 'mmbiz.qpic.cn' in img_url:
+            if "mmbiz.qpic.cn" in img_url:
                 # print(img_url)
                 data_src, img = self.download_img(img_url)
                 img_lst.append([data_src, img])
                 # Replace the image reference with its absolute path
-                html = html.replace(img_url,
-                                    data_src).replace('data-src=', 'src=')
+                html = html.replace(img_url, data_src).replace("data-src=", "src=")
         return html, img_lst
@@ -77,21 +75,24 @@ def get_title(self, html):
         """
         try:
             # title = html.split('activity-name">')[1].split('')[1].strip()
-            title = html.split('')[1].strip()
+            title = html.split("")[1].strip()
             return title
         except Exception as e:
             print(e)
diff --git a/wechatarticles/nickname2biz.py b/wechatarticles/nickname2biz.py
--- a/wechatarticles/nickname2biz.py
+++ b/wechatarticles/nickname2biz.py
-            biz_lst = re.findall(r'?', s.text)
+            biz_lst = re.findall(
+                r'?', s.text
+            )
             if biz_lst != []:
                 nicknames_lst = re.findall(
-                    r'(.+)', s.text)
+                    r'(.+)', s.text
+                )
                 tmp = self.biz_name.format(biz_lst[0], nicknames_lst[0])
                 self.res_lst.append(tmp)
             time.sleep(self.t)
@@ -120,12 +122,12 @@ def qingbo(self, nickname_lst):
         return self.res_lst
 
-if __name__ == '__main__':
-    nickname_lst = ['科技美学', 'AppSo', 'InfoQ']
-    cookie = ''
-    nb = nickname2biz(cookie, method='xigua', t=10)
+if __name__ == "__main__":
+    nickname_lst = ["科技美学", "AppSo", "InfoQ"]
+    cookie = ""
+    nb = nickname2biz(cookie, method="xigua", t=10)
     res_lst = nb.run(nickname_lst)
 
-    fname = '1.txt'
-    with open(fname, 'w', encoding='utf-8') as f:
-        f.write('\n'.join(item for item in res_lst))
+    fname = "1.txt"
+    with open(fname, "w", encoding="utf-8") as f:
+        f.write("\n".join(item for item in res_lst))
diff --git a/wechatarticles/tools.py b/wechatarticles/tools.py
index 905ed35..f2c9383 100644
--- a/wechatarticles/tools.py
+++ b/wechatarticles/tools.py
@@ -22,13 +22,10 @@ def timestamp2date(timestamp):
     datetime = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
     return datetime
 
-def save_mongo(data,
-               host=None,
-               port=None,
-               name=None,
-               password="",
-               dbname=None,
-               collname=None):
+
+def save_mongo(
+    data, host=None, port=None, name=None, password="", dbname=None, collname=None
+):
     """
     Save the data to mongo
     Parameters
@@ -61,13 +58,14 @@
     assert isinstance(host, str)
     assert isinstance(name, str)
     assert isinstance(password, str)
-    assert isinstance(dbname, str) 
+    assert isinstance(dbname, str)
     assert isinstance(collname, str)
     if not isinstance(port, int):
         raise TypeError("port must be an instance of int")
 
     from pymongo import MongoClient
+
     # Connect to the database and insert the data in one batch
     client = MongoClient(host, port)
     db_auth = client.admin
@@ -75,6 +73,7 @@
     coll = client[dbname][collname]
     coll.insert_many(data)
 
+
 def save_json(fname, data):
     """
     Save the data in txt format
diff --git a/wechatarticles/utils.py b/wechatarticles/utils.py
index 8259543..07f86a0 100644
--- a/wechatarticles/utils.py
+++ b/wechatarticles/utils.py
@@ -2,6 +2,7 @@
 """
 Helper script functions
 """
+import base64
 import html
 import json
 import os
@@ -13,10 +14,10 @@
 
 from .GetUrls import PCUrls
 
-base_columns = ['url', 'title', 'date', 'headlines', 'copyright']
-A_columns = ['read_num', 'old_like_num', 'like_num']
-B_columns = ['comments_num', 'comments_content', 'comments_like_num']
-C_columns = ['content', 'content_num', 'pic_num']
+base_columns = ["url", "title", "date", "headlines", "copyright"]
+A_columns = ["read_num", "old_like_num", "like_num"]
+B_columns = ["comments_num", "comments_content", "comments_like_num"]
+C_columns = ["content", "content_num", "pic_num"]
 mode_columns = {
     1: A_columns,
     2: B_columns,
@@ -24,10 +25,10 @@
     4: A_columns + B_columns,
     5: A_columns + C_columns,
     6: B_columns + C_columns,
-    7: A_columns + B_columns + C_columns
+    7: A_columns + B_columns + C_columns,
 }
 
-ctext = '你的访问过于频繁,需要从微信打开验证身份,是否需要继续访问当前页面'
+ctext = "你的访问过于频繁,需要从微信打开验证身份,是否需要继续访问当前页面"
 
 # url, readnum likenum
@@ -38,7 +39,7 @@ def flatten(x):
 def remove_duplicate_json(fname):
     # Remove duplicate records from the json
     # fname: xxx.json
-    with open(fname, 'r', encoding='utf-8') as f:
+    with open(fname, "r", encoding="utf-8") as f:
         data = f.readlines()
 
     id_re = re.compile(r'datetime": (.+), "fakeid"')
@@ -51,7 +52,7 @@ def remove_duplicate_json(fname):
     #     key=lambda line: re.findall(
     #         r'datetime": (.+), "fakeid"', line)[0])[::-1]
 
-    with open(fname, 'w', encoding='utf-8') as f:
+    with open(fname, "w", encoding="utf-8") as f:
         f.writelines(sort_data)
@@ -64,15 +65,15 @@ def end_func(timestamp, end_timestamp):
 
 def transfer_url(url):
     url = html.unescape(html.unescape(url))
-    return eval(repr(url).replace('\\', ''))
+    return eval(repr(url).replace("\\", ""))
 
 
 def save_f(fname):
     i = 1
     while True:
-        if os.path.isfile('{}.json'.format(fname)):
+        if os.path.isfile("{}.json".format(fname)):
             i += 1
-            fname += '-' + str(i)
+            fname += "-" + str(i)
         else:
             break
@@ -92,13 +93,12 @@ def verify_url(article_url):
 
 def get_content(url, cookie):
     headers = {
-        'user-agent':
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
-        'cookie': cookie
+        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
+        "cookie": cookie,
     }
 
     html_text = requests.get(url.strip(), headers=headers).text
-    soup = bs(html_text, 'lxml')
+    soup = bs(html_text, "lxml")
     if ctext in html_text:
         assert 1 == 2
     # loaded via js
@@ -108,16 +108,15 @@
         body = soup.find(class_="rich_media_area_primary_inner")
         content_p = body.find(class_="rich_media_content")
         if content_p:
-            imgs = body.find_all('img')
-            return content_p.text.strip(), len(
-                content_p.text.strip()), len(imgs)
+            imgs = body.find_all("img")
+            return content_p.text.strip(), len(content_p.text.strip()), len(imgs)
         else:
             content_p = soup.find(id="js_panel_like_title").text.strip()
             return content_p, len(content_p), 0
     # with open(txt_name, 'w', encoding='utf-8') as f:
     #     f.write(content_p.text)
     except:
-        return '', 0, 0
+        return "", 0, 0
 
 
 def copyright_num(copyright_stat):
@@ -147,19 +146,15 @@ def copyright_num_detailed(copyright_stat):
 def read_nickname(fname):
     # Read the data
-    with open(fname, 'r', encoding='utf-8') as f:
+    with open(fname, "r", encoding="utf-8") as f:
         haved_data = f.readlines()
-    return [line.split(', ') for line in haved_data]
+    return [line.split(", ") for line in haved_data]
 
 
-def get_history_urls(biz,
-                     uin,
-                     key,
-                     lst=[],
-                     start_timestamp=0,
-                     count=10,
-                     endcount=99999):
-    t = PCUrls(biz=biz, uin=uin, cookie='')
+def get_history_urls(
+    biz, uin, key, lst=[], start_timestamp=0, count=10, endcount=99999
+):
+    t = PCUrls(biz=biz, uin=uin, cookie="")
     try:
         while True:
             res = t.get_urls(key, offset=count)
@@ -173,7 +168,7 @@
             break
         time.sleep(5)
     except KeyboardInterrupt as e:
-        print('Program interrupted manually')
+        print("Program interrupted manually")
         return lst
     except Exception as e:
         print(e)
@@ -181,3 +176,11 @@
         assert 1 == 2
     finally:
         return lst
+
+
+def swap_biz_id(biz=None, fakeid=None):
+    if biz == None:
+        return str(base64.b64encode(fakeid.encode()), encoding="utf-8")
+    if fakeid == None:
+        return str(base64.b64decode(biz.encode()), encoding="utf-8")
+    return None
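
Usage note for the new swap_biz_id helper added to wechatarticles/utils.py in this patch: it converts between an official account's fakeid and its __biz parameter by base64-encoding or -decoding, depending on which keyword argument is supplied. A minimal sketch follows; the fakeid value is made up for illustration and does not refer to a real account:

    from wechatarticles.utils import swap_biz_id

    fakeid = "3218742485"                        # hypothetical fakeid, not a real account
    biz = swap_biz_id(fakeid=fakeid)             # base64-encode: fakeid -> __biz ("MzIxODc0MjQ4NQ==")
    assert swap_biz_id(biz=biz) == fakeid        # base64-decode: __biz -> fakeid
    print(swap_biz_id(biz=biz, fakeid=fakeid))   # both arguments supplied -> None

Supplying exactly one keyword argument selects the conversion direction; supplying both returns None. Note the helper compares with `== None` rather than the more idiomatic `is None`, though the behavior is the same for these arguments.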