Skip to content

Commit

Permalink
add proxy
Browse files (browse the repository at this point in the history)
  • Loading branch information
wnma3mz committed Oct 13, 2020
1 parent 7ccc88d commit 9ee8203
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 65 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setuptools.setup(
name="wechatarticles",
version="0.5.2",
version="0.5.3",
author="wnma3mz",
author_email="[email protected]",
description="wechat articles scrapy",
Expand Down
35 changes: 23 additions & 12 deletions wechatarticles/ArticlesInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,13 @@ class ArticlesInfo(object):
"""
登录WeChat,获取更加详细的推文信息。如点赞数、阅读数、评论等
"""

def __init__(self, appmsg_token, cookie):
def __init__(self,
appmsg_token,
cookie,
proxies={
'http': None,
'https': None
}):
"""
初始化参数
Parameters
Expand All @@ -28,14 +33,14 @@ def __init__(self, appmsg_token, cookie):
self.headers = {
"User-Agent":
"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
"Cookie":
cookie
"Cookie": cookie
}
self.data = {
"is_only_read": "1",
"is_temp_url": "0",
"appmsg_type": "9", # 新参数,不加入无法获取like_num
"appmsg_type": "9", # 新参数,不加入无法获取like_num
}
self.proxies = proxies

def __verify_url(self, article_url):
"""
Expand Down Expand Up @@ -70,7 +75,8 @@ def read_like_nums(self, article_url):
"""
try:
appmsgstat = self.__get_appmsgext(article_url)["appmsgstat"]
return appmsgstat["read_num"], appmsgstat["like_num"], appmsgstat["old_like_num"]
return appmsgstat["read_num"], appmsgstat["like_num"], appmsgstat[
"old_like_num"]
except Exception:
raise Exception("params is error, please check your article_url")

Expand Down Expand Up @@ -123,8 +129,11 @@ def comments(self, article_url):
__biz, _, idx, _ = self.__get_params(article_url)
getcomment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz={}&idx={}&comment_id={}&limit=100"
try:
url = getcomment_url.format(__biz, idx, self.__get_comment_id(article_url))
comment_json = self.s.get(url, headers=self.headers).json()
url = getcomment_url.format(__biz, idx,
self.__get_comment_id(article_url))
comment_json = self.s.get(url,
headers=self.headers,
proxies=self.proxies).json()
except Exception as e:
print(e)
comment_json = {}
Expand All @@ -143,7 +152,7 @@ def __get_comment_id(self, article_url):
str:
comment_id获取评论必要参数
"""
res = self.s.get(article_url, data=self.data)
res = self.s.get(article_url, data=self.data, proxies=self.proxies)
# 使用正则提取comment_id
comment_id = re.findall(r'comment_id = "\d+"',
res.text)[0].split(" ")[-1][1:-1]
Expand Down Expand Up @@ -203,16 +212,18 @@ def __get_appmsgext(self, article_url):

# 将params参数换到data中请求。这一步貌似不换也行
origin_url = "https://mp.weixin.qq.com/mp/getappmsgext?"
appmsgext_url = origin_url + "appmsg_token={}&x5=0".format(self.appmsg_token)
appmsgext_url = origin_url + "appmsg_token={}&x5=0".format(
self.appmsg_token)
self.data["__biz"] = __biz
self.data["mid"] = mid
self.data["sn"] = sn
self.data["idx"] = idx

# appmsgext_url = origin_url + "__biz={}&mid={}&sn={}&idx={}&appmsg_token={}&x5=1".format(
# __biz, mid, sn, idx, self.appmsg_token)
appmsgext_json = requests.post(
appmsgext_url, headers=self.headers, data=self.data).json()
appmsgext_json = requests.post(appmsgext_url,
headers=self.headers,
data=self.data).json()

if "appmsgstat" not in appmsgext_json.keys():
raise Exception(
Expand Down
38 changes: 16 additions & 22 deletions wechatarticles/ArticlesUrls.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,11 @@ class ArticlesUrls(object):
"""
获取需要爬取的微信公众号的推文链接
"""
def __init__(self, username=None, password=None, cookie=None, token=None):
def __init__(self, cookie, token, proxies={'http': None, 'https': None}):
"""
初始化参数
Parameters
----------
username: str
用户账号
password: str
用户密码
token : str
登录微信公众号平台之后获取的token
cookie : str
Expand All @@ -39,20 +35,11 @@ def __init__(self, username=None, password=None, cookie=None, token=None):
}

# 手动输入cookie和token登录
if (cookie != None) and (token != None):
self.__verify_str(cookie, "cookie")
self.__verify_str(token, "token")
self.headers["Cookie"] = cookie
self.params["token"] = token
# 扫描二维码登录
elif (username != None) and (password != None):
self.__verify_str(username, "username")
self.__verify_str(password, "password")
# 暂不支持cookie缓存
self.__startlogin_official(username, password)
else:
print("please check your paramse")
raise SystemError
self.__verify_str(cookie, "cookie")
self.__verify_str(token, "token")
self.headers["Cookie"] = cookie
self.params["token"] = token
self.proxies = proxies

def __verify_str(self, input_string, param_name):
"""
Expand Down Expand Up @@ -237,7 +224,10 @@ def __login_official(self, username, password):
}
# 获取token的url
bizlogin_url = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=login"
res = self.s.post(bizlogin_url, data=data, headers=self.headers).json()
res = self.s.post(bizlogin_url,
data=data,
headers=self.headers,
proxies=self.proxies).json()

try:
# 截取字符串中的token参数
Expand Down Expand Up @@ -295,7 +285,8 @@ def official_info(self, nickname, begin=0, count=5):
# 返回与输入公众号名称最接近的公众号信息
official = self.s.get(search_url,
headers=self.headers,
params=self.params)
params=self.params,
proxies=self.proxies)
return official.json()["list"]
except Exception:
raise Exception(u"公众号名称错误或cookie、token错误,请重新输入")
Expand Down Expand Up @@ -449,5 +440,8 @@ def __get_articles_data(self,
}
self.params.update(params)

data = self.s.get(appmsg_url, headers=self.headers, params=self.params)
data = self.s.get(appmsg_url,
headers=self.headers,
params=self.params,
proxies=self.proxies)
return data.json()
49 changes: 32 additions & 17 deletions wechatarticles/GetUrls.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,14 @@ class PCUrls(object):
"""
通过PC端的wechat,获取需要爬取的微信公众号的推文链接
"""

def __init__(self, biz, uin, cookie):
def __init__(self,
biz,
uin,
cookie,
proxies={
'http': None,
'https': None
}):
"""
初始化参数
Parameters
Expand All @@ -25,9 +31,8 @@ def __init__(self, biz, uin, cookie):
self.s = requests.session()
self.__biz = biz
self.uin = uin
self.headers = {
'Cookies': cookie
}
self.headers = {'Cookies': cookie}
self.proxies = proxies

def get_urls(self, key, offset='0'):
"""
Expand Down Expand Up @@ -89,22 +94,26 @@ def get_urls(self, key, offset='0'):
}
origin_url = 'https://mp.weixin.qq.com/mp/profile_ext'

msg_json = self.s.get(origin_url, params=self.params,
headers=self.headers).json()
msg_json = self.s.get(origin_url,
params=self.params,
headers=self.headers,
proxies=self.proxies).json()
if 'general_msg_list' in msg_json.keys():
lst = [item for item in eval(msg_json['general_msg_list'])[
'list'] if 'app_msg_ext_info' in item.keys()]
lst = [
item for item in eval(msg_json['general_msg_list'])['list']
if 'app_msg_ext_info' in item.keys()
]
return lst

raise Exception(
'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key')
'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key'
)


class MobileUrls(object):
"""
通过移动端的wechat,获取需要爬取的微信公众号的推文链接
"""

def __init__(self, biz, cookie):
"""
初始化参数
Expand All @@ -121,7 +130,8 @@ def __init__(self, biz, cookie):
self.s = requests.session()
self.__biz = biz
self.headers = {
'User-Agent': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile',
'User-Agent':
'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile',
'Cookie': cookie
}

Expand Down Expand Up @@ -184,12 +194,17 @@ def get_urls(self, appmsg_token, offset='0'):
}
origin_url = 'https://mp.weixin.qq.com/mp/profile_ext'

msg_json = self.s.get(origin_url, params=self.params,
headers=self.headers).json()
msg_json = self.s.get(origin_url,
params=self.params,
headers=self.headers,
proxies=self.proxies).json()
if 'general_msg_list' in msg_json.keys():
lst = [item for item in eval(msg_json['general_msg_list'])[
'list'] if 'app_msg_ext_info' in item.keys()]
lst = [
item for item in eval(msg_json['general_msg_list'])['list']
if 'app_msg_ext_info' in item.keys()
]
return lst

raise Exception(
'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key')
'Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key'
)
20 changes: 11 additions & 9 deletions wechatarticles/Url2Html.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def download_img(self, url):
img = f.read()
return imgpath, img

response = requests.get(url)
response = requests.get(url, proxies=self.proxies)
img = response.content
with open(imgpath, 'wb') as f:
f.write(img)
Expand Down Expand Up @@ -140,7 +140,7 @@ def rename_title(self, title, html):
'[{}]-{}-{}'.format(account_name, date, title))
return title

def run(self, url, mode, **kwargs):
def run(self, url, mode, proxies={'http': None, 'https': None}, **kwargs):
"""
启动函数
url: 微信文章链接
Expand All @@ -156,17 +156,18 @@ def run(self, url, mode, **kwargs):
proxies: 代理
img_path: 图片下载路径
"""
proxies = self.proxies
if mode == 1:
return requests.get(url).text
return requests.get(url, proxies=proxies).text
elif mode in [2, 3, 4]:
if 'img_path' in kwargs.keys():
self.img_path = kwargs['img_path']
else:
return '{} 请输入保存图片路径!'.format(url)
if mode == 2:
return requests.get(url).text
return requests.get(url, proxies=proxies).text
elif mode == 3:
html = requests.get(url).text
html = requests.get(url, proxies=proxies).text
html_img, _ = self.replace_img(html)
return html_img
else:
Expand All @@ -175,9 +176,9 @@ def run(self, url, mode, **kwargs):
else:
return '{} 请输入保存图片路径!'.format(url)
if mode == 2:
return requests.get(url).text
return requests.get(url, proxies=proxies).text
elif mode == 3:
html = requests.get(url).text
html = requests.get(url, proxies=proxies).text
html_img, _ = self.replace_img(html)
return html_img
else:
Expand Down Expand Up @@ -210,7 +211,8 @@ def run(self, url, mode, **kwargs):
title = self.rename_title(title, html)

html_img, _ = self.replace_img(html)
with open('{}.html'.format(title), 'w', encoding='utf-8') as f:
with open('{}.html'.format(title), 'w',
encoding='utf-8') as f:
f.write(html_img)
return '{} success!'.format(url)
else:
Expand All @@ -226,4 +228,4 @@ def run(self, url, mode, **kwargs):
uh = Url2Html()
for url in url_lst:
s = uh.run(url, mode=4, img_path='D:\\imgs')
print(s)
print(s)
23 changes: 19 additions & 4 deletions wechatarticles/nickname2biz.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,15 @@

class nickname2biz(object):
"""输入公众号名称转biz"""
def __init__(self, cookie, token=None, method=None, t=120):
def __init__(self,
cookie,
token=None,
method=None,
t=120,
proxies={
'http': None,
'https': None,
}):
"""
cookie: 平台登录的cookie
token: 官方获取时需要token
Expand All @@ -36,14 +44,17 @@ def __init__(self, cookie, token=None, method=None, t=120):
self.token = token
self.t = t
self.biz_name = '{}, {}'
self.proxies = proxies

def run(self, nickname_lst):
if self.method == 'xigua':
return self.xigua(nickname_lst)
elif self.method == 'qingbo':
return self.qingbo(nickname_lst)
else:
s = ArticlesUrls(cookie=self.cookie, token=self.token)
s = ArticlesUrls(cookie=self.cookie,
token=self.token,
proxies=self.proxies)
return self.office(s, nickname_lst)

def office(self, s, nickname_lst):
Expand All @@ -67,7 +78,9 @@ def xigua(self, nickname_lst):
self.res_lst = []
for nickname in nickname_lst:
try:
s = requests.get(url.format(nickname), headers=self.headers)
s = requests.get(url.format(nickname),
headers=self.headers,
proxies=self.proxies)
soup = bs(s.text, 'lxml')
infos = soup.find_all(class_="number-details")
if infos:
Expand All @@ -90,7 +103,9 @@ def qingbo(self, nickname_lst):
self.res_lst = []
for nickname in nickname_lst:
try:
s = requests.get(url.format(nickname), headers=self.headers)
s = requests.get(url.format(nickname),
headers=self.headers,
proxies=self.proxies)
biz_lst = re.findall(
r'<input type="hidden" class="biz" value="(.+)">?', s.text)
if biz_lst != []:
Expand Down

0 comments on commit 9ee8203

Please sign in to comment.