From 9ee82031bab49cae0e0cbc7cd4c41008bf38d074 Mon Sep 17 00:00:00 2001 From: wnma3mz Date: Tue, 13 Oct 2020 11:18:49 +0800 Subject: [PATCH] add proxy --- setup.py | 2 +- wechatarticles/ArticlesInfo.py | 35 +++++++++++++++--------- wechatarticles/ArticlesUrls.py | 38 +++++++++++--------------- wechatarticles/GetUrls.py | 49 ++++++++++++++++++++++------------ wechatarticles/Url2Html.py | 20 +++++++------- wechatarticles/nickname2biz.py | 23 +++++++++++++--- 6 files changed, 102 insertions(+), 65 deletions(-) diff --git a/setup.py b/setup.py index 5adfb12..62f947f 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setuptools.setup( name="wechatarticles", - version="0.5.2", + version="0.5.3", author="wnma3mz", author_email="wnma3mz@gmail.com", description="wechat articles scrapy", diff --git a/wechatarticles/ArticlesInfo.py b/wechatarticles/ArticlesInfo.py index abcf9e0..e9d081c 100755 --- a/wechatarticles/ArticlesInfo.py +++ b/wechatarticles/ArticlesInfo.py @@ -8,8 +8,13 @@ class ArticlesInfo(object): """ 登录WeChat,获取更加详细的推文信息。如点赞数、阅读数、评论等 """ - - def __init__(self, appmsg_token, cookie): + def __init__(self, + appmsg_token, + cookie, + proxies={ + 'http': None, + 'https': None + }): """ 初始化参数 Parameters @@ -28,14 +33,14 @@ def __init__(self, appmsg_token, cookie): self.headers = { "User-Agent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile", - "Cookie": - cookie + "Cookie": cookie } self.data = { "is_only_read": "1", "is_temp_url": "0", - "appmsg_type": "9", # 新参数,不加入无法获取like_num + "appmsg_type": "9", # 新参数,不加入无法获取like_num } + self.proxies = proxies def __verify_url(self, article_url): """ @@ -70,7 +75,8 @@ def read_like_nums(self, article_url): """ try: appmsgstat = self.__get_appmsgext(article_url)["appmsgstat"] - return appmsgstat["read_num"], appmsgstat["like_num"], appmsgstat["old_like_num"] + return appmsgstat["read_num"], appmsgstat["like_num"], appmsgstat[ + "old_like_num"] except Exception: raise Exception("params is error, please check your article_url") @@ -123,8 +129,11 @@ def comments(self, article_url): __biz, _, idx, _ = self.__get_params(article_url) getcomment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz={}&idx={}&comment_id={}&limit=100" try: - url = getcomment_url.format(__biz, idx, self.__get_comment_id(article_url)) - comment_json = self.s.get(url, headers=self.headers).json() + url = getcomment_url.format(__biz, idx, + self.__get_comment_id(article_url)) + comment_json = self.s.get(url, + headers=self.headers, + proxies=self.proxies).json() except Exception as e: print(e) comment_json = {} @@ -143,7 +152,7 @@ def __get_comment_id(self, article_url): str: comment_id获取评论必要参数 """ - res = self.s.get(article_url, data=self.data) + res = self.s.get(article_url, data=self.data, proxies=self.proxies) # 使用正则提取comment_id comment_id = re.findall(r'comment_id = "\d+"', res.text)[0].split(" ")[-1][1:-1] @@ -203,7 +212,8 @@ def __get_appmsgext(self, article_url): # 将params参数换到data中请求。这一步貌似不换也行 origin_url = "https://mp.weixin.qq.com/mp/getappmsgext?" - appmsgext_url = origin_url + "appmsg_token={}&x5=0".format(self.appmsg_token) + appmsgext_url = origin_url + "appmsg_token={}&x5=0".format( + self.appmsg_token) self.data["__biz"] = __biz self.data["mid"] = mid self.data["sn"] = sn @@ -211,8 +221,9 @@ def __get_appmsgext(self, article_url): # appmsgext_url = origin_url + "__biz={}&mid={}&sn={}&idx={}&appmsg_token={}&x5=1".format( # __biz, mid, sn, idx, self.appmsg_token) - appmsgext_json = requests.post( - appmsgext_url, headers=self.headers, data=self.data).json() + appmsgext_json = requests.post(appmsgext_url, + headers=self.headers, + data=self.data).json() if "appmsgstat" not in appmsgext_json.keys(): raise Exception( diff --git a/wechatarticles/ArticlesUrls.py b/wechatarticles/ArticlesUrls.py index e9c899e..6baa1e0 100755 --- a/wechatarticles/ArticlesUrls.py +++ b/wechatarticles/ArticlesUrls.py @@ -10,15 +10,11 @@ class ArticlesUrls(object): """ 获取需要爬取的微信公众号的推文链接 """ - def __init__(self, username=None, password=None, cookie=None, token=None): + def __init__(self, cookie, token, proxies={'http': None, 'https': None}): """ 初始化参数 Parameters ---------- - username: str - 用户账号 - password: str - 用户密码 token : str 登录微信公众号平台之后获取的token cookie : str @@ -39,20 +35,11 @@ def __init__(self, username=None, password=None, cookie=None, token=None): } # 手动输入cookie和token登录 - if (cookie != None) and (token != None): - self.__verify_str(cookie, "cookie") - self.__verify_str(token, "token") - self.headers["Cookie"] = cookie - self.params["token"] = token - # 扫描二维码登录 - elif (username != None) and (password != None): - self.__verify_str(username, "username") - self.__verify_str(password, "password") - # 暂不支持cookie缓存 - self.__startlogin_official(username, password) - else: - print("please check your paramse") - raise SystemError + self.__verify_str(cookie, "cookie") + self.__verify_str(token, "token") + self.headers["Cookie"] = cookie + self.params["token"] = token + self.proxies = proxies def __verify_str(self, input_string, param_name): """ @@ -237,7 +224,10 @@ def __login_official(self, username, password): } # 获取token的url bizlogin_url = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=login" - res = self.s.post(bizlogin_url, data=data, headers=self.headers).json() + res = self.s.post(bizlogin_url, + data=data, + headers=self.headers, + proxies=self.proxies).json() try: # 截取字符串中的token参数 @@ -295,7 +285,8 @@ def official_info(self, nickname, begin=0, count=5): # 返回与输入公众号名称最接近的公众号信息 official = self.s.get(search_url, headers=self.headers, - params=self.params) + params=self.params, + proxies=self.proxies) return official.json()["list"] except Exception: raise Exception(u"公众号名称错误或cookie、token错误,请重新输入") @@ -449,5 +440,8 @@ def __get_articles_data(self, } self.params.update(params) - data = self.s.get(appmsg_url, headers=self.headers, params=self.params) + data = self.s.get(appmsg_url, + headers=self.headers, + params=self.params, + proxies=self.proxies) return data.json() diff --git a/wechatarticles/GetUrls.py b/wechatarticles/GetUrls.py index 1a63dc3..16cbb27 100644 --- a/wechatarticles/GetUrls.py +++ b/wechatarticles/GetUrls.py @@ -6,8 +6,14 @@ class PCUrls(object): """ 通过PC端的wechat,获取需要爬取的微信公众号的推文链接 """ - - def __init__(self, biz, uin, cookie): + def __init__(self, + biz, + uin, + cookie, + proxies={ + 'http': None, + 'https': None + }): """ 初始化参数 Parameters @@ -25,9 +31,8 @@ def __init__(self, biz, uin, cookie): self.s = requests.session() self.__biz = biz self.uin = uin - self.headers = { - 'Cookies': cookie - } + self.headers = {'Cookies': cookie} + self.proxies = proxies def get_urls(self, key, offset='0'): """ @@ -89,22 +94,26 @@ def get_urls(self, key, offset='0'): } origin_url = 'https://mp.weixin.qq.com/mp/profile_ext' - msg_json = self.s.get(origin_url, params=self.params, - headers=self.headers).json() + msg_json = self.s.get(origin_url, + params=self.params, + headers=self.headers, + proxies=self.proxies).json() if 'general_msg_list' in msg_json.keys(): - lst = [item for item in eval(msg_json['general_msg_list'])[ - 'list'] if 'app_msg_ext_info' in item.keys()] + lst = [ + item for item in eval(msg_json['general_msg_list'])['list'] + if 'app_msg_ext_info' in item.keys() + ] return lst raise Exception( - 'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key') + 'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key' + ) class MobileUrls(object): """ 通过移动端的wechat,获取需要爬取的微信公众号的推文链接 """ - def __init__(self, biz, cookie): """ 初始化参数 @@ -121,7 +130,8 @@ def __init__(self, biz, cookie): self.s = requests.session() self.__biz = biz self.headers = { - 'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile', + 'User-Agent': + 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile', 'Cookie': cookie } @@ -184,12 +194,17 @@ def get_urls(self, appmsg_token, offset='0'): } origin_url = 'https://mp.weixin.qq.com/mp/profile_ext' - msg_json = self.s.get(origin_url, params=self.params, - headers=self.headers).json() + msg_json = self.s.get(origin_url, + params=self.params, + headers=self.headers, + proxies=self.proxies).json() if 'general_msg_list' in msg_json.keys(): - lst = [item for item in eval(msg_json['general_msg_list'])[ - 'list'] if 'app_msg_ext_info' in item.keys()] + lst = [ + item for item in eval(msg_json['general_msg_list'])['list'] + if 'app_msg_ext_info' in item.keys() + ] return lst raise Exception( - 'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key') + 'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key' + ) diff --git a/wechatarticles/Url2Html.py b/wechatarticles/Url2Html.py index 950e270..4031a01 100644 --- a/wechatarticles/Url2Html.py +++ b/wechatarticles/Url2Html.py @@ -42,7 +42,7 @@ def download_img(self, url): img = f.read() return imgpath, img - response = requests.get(url) + response = requests.get(url, proxies=self.proxies) img = response.content with open(imgpath, 'wb') as f: f.write(img) @@ -140,7 +140,7 @@ def rename_title(self, title, html): '[{}]-{}-{}'.format(account_name, date, title)) return title - def run(self, url, mode, **kwargs): + def run(self, url, mode, proxies={'http': None, 'https': None}, **kwargs): """ 启动函数 url: 微信文章链接 @@ -156,17 +156,18 @@ def run(self, url, mode, **kwargs): proxies: 代理 img_path: 图片下载路径 """ + proxies = self.proxies if mode == 1: - return requests.get(url).text + return requests.get(url, proxies=proxies).text elif mode in [2, 3, 4]: if 'img_path' in kwargs.keys(): self.img_path = kwargs['img_path'] else: return '{} 请输入保存图片路径!'.format(url) if mode == 2: - return requests.get(url).text + return requests.get(url, proxies=proxies).text elif mode == 3: - html = requests.get(url).text + html = requests.get(url, proxies=proxies).text html_img, _ = self.replace_img(html) return html_img else: @@ -175,9 +176,9 @@ def run(self, url, mode, **kwargs): else: return '{} 请输入保存图片路径!'.format(url) if mode == 2: - return requests.get(url).text + return requests.get(url, proxies=proxies).text elif mode == 3: - html = requests.get(url).text + html = requests.get(url, proxies=proxies).text html_img, _ = self.replace_img(html) return html_img else: @@ -210,7 +211,8 @@ def run(self, url, mode, **kwargs): title = self.rename_title(title, html) html_img, _ = self.replace_img(html) - with open('{}.html'.format(title), 'w', encoding='utf-8') as f: + with open('{}.html'.format(title), 'w', + encoding='utf-8') as f: f.write(html_img) return '{} success!'.format(url) else: @@ -226,4 +228,4 @@ def run(self, url, mode, **kwargs): uh = Url2Html() for url in url_lst: s = uh.run(url, mode=4, img_path='D:\\imgs') - print(s) \ No newline at end of file + print(s) diff --git a/wechatarticles/nickname2biz.py b/wechatarticles/nickname2biz.py index 27514e9..d4c8b8f 100644 --- a/wechatarticles/nickname2biz.py +++ b/wechatarticles/nickname2biz.py @@ -11,7 +11,15 @@ class nickname2biz(object): """输入公众号名称转biz""" - def __init__(self, cookie, token=None, method=None, t=120): + def __init__(self, + cookie, + token=None, + method=None, + t=120, + proxies={ + 'http': None, + 'https': None, + }): """ cookie: 平台登录的cookie token: 官方获取时需要token @@ -36,6 +44,7 @@ def __init__(self, cookie, token=None, method=None, t=120): self.token = token self.t = t self.biz_name = '{}, {}' + self.proxies = proxies def run(self, nickname_lst): if self.method == 'xigua': @@ -43,7 +52,9 @@ def run(self, nickname_lst): elif self.method == 'qingbo': return self.qingbo(nickname_lst) else: - s = ArticlesUrls(cookie=self.cookie, token=self.token) + s = ArticlesUrls(cookie=self.cookie, + token=self.token, + proxies=self.proxies) return self.office(s, nickname_lst) def office(self, s, nickname_lst): @@ -67,7 +78,9 @@ def xigua(self, nickname_lst): self.res_lst = [] for nickname in nickname_lst: try: - s = requests.get(url.format(nickname), headers=self.headers) + s = requests.get(url.format(nickname), + headers=self.headers, + proxies=self.proxies) soup = bs(s.text, 'lxml') infos = soup.find_all(class_="number-details") if infos: @@ -90,7 +103,9 @@ def qingbo(self, nickname_lst): self.res_lst = [] for nickname in nickname_lst: try: - s = requests.get(url.format(nickname), headers=self.headers) + s = requests.get(url.format(nickname), + headers=self.headers, + proxies=self.proxies) biz_lst = re.findall( r'?', s.text) if biz_lst != []: