add proxy

wnma3mz · Oct 13, 2020 · 9ee8203 · 9ee8203
1 parent 7ccc88d
commit 9ee8203
Show file tree

Hide file tree

Showing 6 changed files with 102 additions and 65 deletions.
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 
 setuptools.setup(
     name="wechatarticles",
-    version="0.5.2",
+    version="0.5.3",
     author="wnma3mz",
     author_email="[email protected]",
     description="wechat articles scrapy",

diff --git a/wechatarticles/ArticlesInfo.py b/wechatarticles/ArticlesInfo.py
@@ -8,8 +8,13 @@ class ArticlesInfo(object):
     """
     登录WeChat，获取更加详细的推文信息。如点赞数、阅读数、评论等
     """
-
-    def __init__(self, appmsg_token, cookie):
+    def __init__(self,
+                 appmsg_token,
+                 cookie,
+                 proxies={
+                     'http': None,
+                     'https': None
+                 }):
         """
         初始化参数
         Parameters
@@ -28,14 +33,14 @@ def __init__(self, appmsg_token, cookie):
         self.headers = {
             "User-Agent":
             "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
-            "Cookie":
-            cookie
+            "Cookie": cookie
         }
         self.data = {
             "is_only_read": "1",
             "is_temp_url": "0",
-            "appmsg_type": "9", # 新参数，不加入无法获取like_num
+            "appmsg_type": "9",  # 新参数，不加入无法获取like_num
         }
+        self.proxies = proxies
 
     def __verify_url(self, article_url):
         """
@@ -70,7 +75,8 @@ def read_like_nums(self, article_url):
         """
         try:
             appmsgstat = self.__get_appmsgext(article_url)["appmsgstat"]
-            return appmsgstat["read_num"], appmsgstat["like_num"], appmsgstat["old_like_num"]
+            return appmsgstat["read_num"], appmsgstat["like_num"], appmsgstat[
+                "old_like_num"]
         except Exception:
             raise Exception("params is error, please check your article_url")
 
@@ -123,8 +129,11 @@ def comments(self, article_url):
         __biz, _, idx, _ = self.__get_params(article_url)
         getcomment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz={}&idx={}&comment_id={}&limit=100"
         try:
-            url = getcomment_url.format(__biz, idx, self.__get_comment_id(article_url))
-            comment_json = self.s.get(url, headers=self.headers).json()
+            url = getcomment_url.format(__biz, idx,
+                                        self.__get_comment_id(article_url))
+            comment_json = self.s.get(url,
+                                      headers=self.headers,
+                                      proxies=self.proxies).json()
         except Exception as e:
             print(e)
             comment_json = {}
@@ -143,7 +152,7 @@ def __get_comment_id(self, article_url):
         str:
             comment_id获取评论必要参数
         """
-        res = self.s.get(article_url, data=self.data)
+        res = self.s.get(article_url, data=self.data, proxies=self.proxies)
         # 使用正则提取comment_id
         comment_id = re.findall(r'comment_id = "\d+"',
                                 res.text)[0].split(" ")[-1][1:-1]
@@ -203,16 +212,18 @@ def __get_appmsgext(self, article_url):
 
         # 将params参数换到data中请求。这一步貌似不换也行
         origin_url = "https://mp.weixin.qq.com/mp/getappmsgext?"
-        appmsgext_url = origin_url + "appmsg_token={}&x5=0".format(self.appmsg_token)
+        appmsgext_url = origin_url + "appmsg_token={}&x5=0".format(
+            self.appmsg_token)
         self.data["__biz"] = __biz
         self.data["mid"] = mid
         self.data["sn"] = sn
         self.data["idx"] = idx
 
         # appmsgext_url = origin_url + "__biz={}&mid={}&sn={}&idx={}&appmsg_token={}&x5=1".format(
         #     __biz, mid, sn, idx, self.appmsg_token)
-        appmsgext_json = requests.post(
-            appmsgext_url, headers=self.headers, data=self.data).json()
+        appmsgext_json = requests.post(appmsgext_url,
+                                       headers=self.headers,
+                                       data=self.data).json()
 
         if "appmsgstat" not in appmsgext_json.keys():
             raise Exception(

diff --git a/wechatarticles/ArticlesUrls.py b/wechatarticles/ArticlesUrls.py
@@ -10,15 +10,11 @@ class ArticlesUrls(object):
     """
     获取需要爬取的微信公众号的推文链接
     """
-    def __init__(self, username=None, password=None, cookie=None, token=None):
+    def __init__(self, cookie, token, proxies={'http': None, 'https': None}):
         """
         初始化参数
         Parameters
         ----------
-        username: str
-            用户账号
-        password: str
-            用户密码
         token : str
             登录微信公众号平台之后获取的token
         cookie : str
@@ -39,20 +35,11 @@ def __init__(self, username=None, password=None, cookie=None, token=None):
         }
 
         # 手动输入cookie和token登录
-        if (cookie != None) and (token != None):
-            self.__verify_str(cookie, "cookie")
-            self.__verify_str(token, "token")
-            self.headers["Cookie"] = cookie
-            self.params["token"] = token
-        # 扫描二维码登录
-        elif (username != None) and (password != None):
-            self.__verify_str(username, "username")
-            self.__verify_str(password, "password")
-            # 暂不支持cookie缓存
-            self.__startlogin_official(username, password)
-        else:
-            print("please check your paramse")
-            raise SystemError
+        self.__verify_str(cookie, "cookie")
+        self.__verify_str(token, "token")
+        self.headers["Cookie"] = cookie
+        self.params["token"] = token
+        self.proxies = proxies
 
     def __verify_str(self, input_string, param_name):
         """
@@ -237,7 +224,10 @@ def __login_official(self, username, password):
         }
         # 获取token的url
         bizlogin_url = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=login"
-        res = self.s.post(bizlogin_url, data=data, headers=self.headers).json()
+        res = self.s.post(bizlogin_url,
+                          data=data,
+                          headers=self.headers,
+                          proxies=self.proxies).json()
 
         try:
             # 截取字符串中的token参数
@@ -295,7 +285,8 @@ def official_info(self, nickname, begin=0, count=5):
             # 返回与输入公众号名称最接近的公众号信息
             official = self.s.get(search_url,
                                   headers=self.headers,
-                                  params=self.params)
+                                  params=self.params,
+                                  proxies=self.proxies)
             return official.json()["list"]
         except Exception:
             raise Exception(u"公众号名称错误或cookie、token错误，请重新输入")
@@ -449,5 +440,8 @@ def __get_articles_data(self,
         }
         self.params.update(params)
 
-        data = self.s.get(appmsg_url, headers=self.headers, params=self.params)
+        data = self.s.get(appmsg_url,
+                          headers=self.headers,
+                          params=self.params,
+                          proxies=self.proxies)
         return data.json()
diff --git a/wechatarticles/GetUrls.py b/wechatarticles/GetUrls.py
@@ -6,8 +6,14 @@ class PCUrls(object):
     """
     通过PC端的wechat，获取需要爬取的微信公众号的推文链接
     """
-
-    def __init__(self, biz, uin, cookie):
+    def __init__(self,
+                 biz,
+                 uin,
+                 cookie,
+                 proxies={
+                     'http': None,
+                     'https': None
+                 }):
         """
         初始化参数
         Parameters
@@ -25,9 +31,8 @@ def __init__(self, biz, uin, cookie):
         self.s = requests.session()
         self.__biz = biz
         self.uin = uin
-        self.headers = {
-            'Cookies': cookie
-        }
+        self.headers = {'Cookies': cookie}
+        self.proxies = proxies
 
     def get_urls(self, key, offset='0'):
         """
@@ -89,22 +94,26 @@ def get_urls(self, key, offset='0'):
         }
         origin_url = 'https://mp.weixin.qq.com/mp/profile_ext'
 
-        msg_json = self.s.get(origin_url, params=self.params,
-                              headers=self.headers).json()
+        msg_json = self.s.get(origin_url,
+                              params=self.params,
+                              headers=self.headers,
+                              proxies=self.proxies).json()
         if 'general_msg_list' in msg_json.keys():
-            lst = [item for item in eval(msg_json['general_msg_list'])[
-                'list'] if 'app_msg_ext_info' in item.keys()]
+            lst = [
+                item for item in eval(msg_json['general_msg_list'])['list']
+                if 'app_msg_ext_info' in item.keys()
+            ]
             return lst
 
         raise Exception(
-            'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key')
+            'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key'
+        )
 
 
 class MobileUrls(object):
     """
     通过移动端的wechat，获取需要爬取的微信公众号的推文链接
     """
-
     def __init__(self, biz, cookie):
         """
         初始化参数
@@ -121,7 +130,8 @@ def __init__(self, biz, cookie):
         self.s = requests.session()
         self.__biz = biz
         self.headers = {
-            'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile',
+            'User-Agent':
+            'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile',
             'Cookie': cookie
         }
 
@@ -184,12 +194,17 @@ def get_urls(self, appmsg_token, offset='0'):
         }
         origin_url = 'https://mp.weixin.qq.com/mp/profile_ext'
 
-        msg_json = self.s.get(origin_url, params=self.params,
-                              headers=self.headers).json()
+        msg_json = self.s.get(origin_url,
+                              params=self.params,
+                              headers=self.headers,
+                              proxies=self.proxies).json()
         if 'general_msg_list' in msg_json.keys():
-            lst = [item for item in eval(msg_json['general_msg_list'])[
-                'list'] if 'app_msg_ext_info' in item.keys()]
+            lst = [
+                item for item in eval(msg_json['general_msg_list'])['list']
+                if 'app_msg_ext_info' in item.keys()
+            ]
             return lst
 
         raise Exception(
-            'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key')
+            'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key'
+        )
diff --git a/wechatarticles/Url2Html.py b/wechatarticles/Url2Html.py
@@ -42,7 +42,7 @@ def download_img(self, url):
                 img = f.read()
             return imgpath, img
 
-        response = requests.get(url)
+        response = requests.get(url, proxies=self.proxies)
         img = response.content
         with open(imgpath, 'wb') as f:
             f.write(img)
@@ -140,7 +140,7 @@ def rename_title(self, title, html):
                              '[{}]-{}-{}'.format(account_name, date, title))
         return title
 
-    def run(self, url, mode, **kwargs):
+    def run(self, url, mode, proxies={'http': None, 'https': None}, **kwargs):
         """
         启动函数
         url: 微信文章链接
@@ -156,17 +156,18 @@ def run(self, url, mode, **kwargs):
             proxies: 代理
             img_path: 图片下载路径
         """
+        proxies = self.proxies
         if mode == 1:
-            return requests.get(url).text
+            return requests.get(url, proxies=proxies).text
         elif mode in [2, 3, 4]:
             if 'img_path' in kwargs.keys():
                 self.img_path = kwargs['img_path']
             else:
                 return '{} 请输入保存图片路径!'.format(url)
             if mode == 2:
-                return requests.get(url).text
+                return requests.get(url, proxies=proxies).text
             elif mode == 3:
-                html = requests.get(url).text
+                html = requests.get(url, proxies=proxies).text
                 html_img, _ = self.replace_img(html)
                 return html_img
             else:
@@ -175,9 +176,9 @@ def run(self, url, mode, **kwargs):
                 else:
                     return '{} 请输入保存图片路径!'.format(url)
                 if mode == 2:
-                    return requests.get(url).text
+                    return requests.get(url, proxies=proxies).text
                 elif mode == 3:
-                    html = requests.get(url).text
+                    html = requests.get(url, proxies=proxies).text
                     html_img, _ = self.replace_img(html)
                     return html_img
                 else:
@@ -210,7 +211,8 @@ def run(self, url, mode, **kwargs):
                         title = self.rename_title(title, html)
 
                     html_img, _ = self.replace_img(html)
-                    with open('{}.html'.format(title), 'w', encoding='utf-8') as f:
+                    with open('{}.html'.format(title), 'w',
+                              encoding='utf-8') as f:
                         f.write(html_img)
                     return '{} success!'.format(url)
         else:
@@ -226,4 +228,4 @@ def run(self, url, mode, **kwargs):
     uh = Url2Html()
     for url in url_lst:
         s = uh.run(url, mode=4, img_path='D:\\imgs')
-        print(s)
+        print(s)
diff --git a/wechatarticles/nickname2biz.py b/wechatarticles/nickname2biz.py
@@ -11,7 +11,15 @@
 
 class nickname2biz(object):
     """输入公众号名称转biz"""
-    def __init__(self, cookie, token=None, method=None, t=120):
+    def __init__(self,
+                 cookie,
+                 token=None,
+                 method=None,
+                 t=120,
+                 proxies={
+                     'http': None,
+                     'https': None,
+                 }):
         """
         cookie: 平台登录的cookie
         token: 官方获取时需要token
@@ -36,14 +44,17 @@ def __init__(self, cookie, token=None, method=None, t=120):
         self.token = token
         self.t = t
         self.biz_name = '{}, {}'
+        self.proxies = proxies
 
     def run(self, nickname_lst):
         if self.method == 'xigua':
             return self.xigua(nickname_lst)
         elif self.method == 'qingbo':
             return self.qingbo(nickname_lst)
         else:
-            s = ArticlesUrls(cookie=self.cookie, token=self.token)
+            s = ArticlesUrls(cookie=self.cookie,
+                             token=self.token,
+                             proxies=self.proxies)
             return self.office(s, nickname_lst)
 
     def office(self, s, nickname_lst):
@@ -67,7 +78,9 @@ def xigua(self, nickname_lst):
         self.res_lst = []
         for nickname in nickname_lst:
             try:
-                s = requests.get(url.format(nickname), headers=self.headers)
+                s = requests.get(url.format(nickname),
+                                 headers=self.headers,
+                                 proxies=self.proxies)
                 soup = bs(s.text, 'lxml')
                 infos = soup.find_all(class_="number-details")
                 if infos:
@@ -90,7 +103,9 @@ def qingbo(self, nickname_lst):
         self.res_lst = []
         for nickname in nickname_lst:
             try:
-                s = requests.get(url.format(nickname), headers=self.headers)
+                s = requests.get(url.format(nickname),
+                                 headers=self.headers,
+                                 proxies=self.proxies)
                 biz_lst = re.findall(
                     r'<input type="hidden" class="biz" value="(.+)">?', s.text)
                 if biz_lst != []: