Skip to content

Commit

Permalink
add wechat book api
Browse files Browse the repository at this point in the history
  • Loading branch information
wnma3mz committed Feb 10, 2021
1 parent 0a75499 commit 85a679d
Show file tree
Hide file tree
Showing 11 changed files with 365 additions and 323 deletions.
6 changes: 2 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,15 @@

setuptools.setup(
name="wechatarticles",
version="0.5.7",
version="0.5.8",
author="wnma3mz",
author_email="[email protected]",
description="wechat articles scrapy",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/wnma3mz/wechat_articles_spider",
packages=setuptools.find_packages(),
install_requires=[
'requests>=2.20.0', 'beautifulsoup4>=4.7.1'
],
install_requires=["requests>=2.20.0", "beautifulsoup4>=4.7.1"],
classifiers=(
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
Expand Down
122 changes: 64 additions & 58 deletions wechatarticles/ArticlesAPI.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@ class ArticlesAPI(object):
整合ArticlesInfo和ArticlesInfo, 方便调用
"""

def __init__(self,
username=None,
password=None,
official_cookie=None,
token=None,
appmsg_token=None,
wechat_cookie=None,
outfile=None):
def __init__(
self,
username=None,
password=None,
official_cookie=None,
token=None,
appmsg_token=None,
wechat_cookie=None,
outfile=None,
):
"""
初始化参数
Parameters
Expand Down Expand Up @@ -47,9 +49,9 @@ def __init__(self,
raise SystemError("please check your paramse")

# 支持两种方式, mitmproxy自动获取参数和手动获取参数
if (appmsg_token == None) and (wechat_cookie == None) and (outfile !=
None):
if (appmsg_token == None) and (wechat_cookie == None) and (outfile != None):
from .ReadOutfile import Reader

reader = Reader()
reader.contral(outfile)
self.appmsg_token, self.cookie = reader.request(outfile)
Expand Down Expand Up @@ -83,38 +85,38 @@ def complete_info(self, nickname, begin=0, count=5):
'comments': 文章评论信息
{
"base_resp": {
"errmsg": "ok",
"errmsg": "ok",
"ret": 0
},
},
"elected_comment": [
{
"content": 用户评论文字,
"content_id": "6846263421277569047",
"create_time": 1520098511,
"id": 3,
"is_from_friend": 0,
"is_from_me": 0,
"content": 用户评论文字,
"content_id": "6846263421277569047",
"create_time": 1520098511,
"id": 3,
"is_from_friend": 0,
"is_from_me": 0,
"is_top": 0, 是否被置顶
"like_id": 10001,
"like_num": 3,
"like_status": 0,
"logo_url": "http://wx.qlogo.cn/mmhead/OibRNdtlJdkFLMHYLMR92Lvq0PicDpJpbnaicP3Z6kVcCicLPVjCWbAA9w/132",
"my_id": 23,
"nick_name": 评论用户的名字,
"like_id": 10001,
"like_num": 3,
"like_status": 0,
"logo_url": "http://wx.qlogo.cn/mmhead/OibRNdtlJdkFLMHYLMR92Lvq0PicDpJpbnaicP3Z6kVcCicLPVjCWbAA9w/132",
"my_id": 23,
"nick_name": 评论用户的名字,
"reply": {
"reply_list": [ ]
}
}
],
],
"elected_comment_total_cnt": 3, 评论总数
"enabled": 1,
"friend_comment": [ ],
"is_fans": 1,
"logo_url": "http://wx.qlogo.cn/mmhead/Q3auHgzwzM6GAic0FAHOu9Gtv5lEu5kUqO6y6EjEFjAhuhUNIS7Y2AQ/132",
"my_comment": [ ],
"nick_name": 当前用户名,
"enabled": 1,
"friend_comment": [ ],
"is_fans": 1,
"logo_url": "http://wx.qlogo.cn/mmhead/Q3auHgzwzM6GAic0FAHOu9Gtv5lEu5kUqO6y6EjEFjAhuhUNIS7Y2AQ/132",
"my_comment": [ ],
"nick_name": 当前用户名,
"only_fans_can_comment": false
},
},
'cover': 封面的url, 'digest': 文章摘要,
'itemidx': 1,
'like_num': 18, 文章点赞数
Expand All @@ -128,15 +130,16 @@ def complete_info(self, nickname, begin=0, count=5):
"""
# 获取文章数据
artiacle_data = self.officical.articles(
nickname, begin=str(begin), count=str(count))
nickname, begin=str(begin), count=str(count)
)

# 提取每个文章的url,获取文章的点赞、阅读、评论信息,并加入到原来的json中
for data in artiacle_data:
article_url = data["link"]
comments = self.wechat.comments(article_url)
read_like_nums = self.wechat.read_like_nums(article_url)
data["comments"] = comments
data["read_num"], data["like_num"], data['old_like_num'] = read_like_nums
data["read_num"], data["like_num"], data["old_like_num"] = read_like_nums

return artiacle_data

Expand All @@ -147,7 +150,7 @@ def __extract_info(self, articles_data):
comments = self.wechat.comments(article_url)
read_like_nums = self.wechat.read_like_nums(article_url)
data["comments"] = comments
data["read_num"], data["like_num"], data['old_like_num'] = read_like_nums
data["read_num"], data["like_num"], data["old_like_num"] = read_like_nums

return articles_data

Expand All @@ -172,38 +175,38 @@ def continue_info(self, nickname, begin=0):
'comments': 文章评论信息
{
"base_resp": {
"errmsg": "ok",
"errmsg": "ok",
"ret": 0
},
},
"elected_comment": [
{
"content": 用户评论文字,
"content_id": "6846263421277569047",
"create_time": 1520098511,
"id": 3,
"is_from_friend": 0,
"is_from_me": 0,
"content": 用户评论文字,
"content_id": "6846263421277569047",
"create_time": 1520098511,
"id": 3,
"is_from_friend": 0,
"is_from_me": 0,
"is_top": 0, 是否被置顶
"like_id": 10001,
"like_num": 3,
"like_status": 0,
"logo_url": "http://wx.qlogo.cn/mmhead/OibRNdtlJdkFLMHYLMR92Lvq0PicDpJpbnaicP3Z6kVcCicLPVjCWbAA9w/132",
"my_id": 23,
"nick_name": 评论用户的名字,
"like_id": 10001,
"like_num": 3,
"like_status": 0,
"logo_url": "http://wx.qlogo.cn/mmhead/OibRNdtlJdkFLMHYLMR92Lvq0PicDpJpbnaicP3Z6kVcCicLPVjCWbAA9w/132",
"my_id": 23,
"nick_name": 评论用户的名字,
"reply": {
"reply_list": [ ]
}
}
],
],
"elected_comment_total_cnt": 3, 评论总数
"enabled": 1,
"friend_comment": [ ],
"is_fans": 1,
"logo_url": "http://wx.qlogo.cn/mmhead/Q3auHgzwzM6GAic0FAHOu9Gtv5lEu5kUqO6y6EjEFjAhuhUNIS7Y2AQ/132",
"my_comment": [ ],
"nick_name": 当前用户名,
"enabled": 1,
"friend_comment": [ ],
"is_fans": 1,
"logo_url": "http://wx.qlogo.cn/mmhead/Q3auHgzwzM6GAic0FAHOu9Gtv5lEu5kUqO6y6EjEFjAhuhUNIS7Y2AQ/132",
"my_comment": [ ],
"nick_name": 当前用户名,
"only_fans_can_comment": false
},
},
'cover': 封面的url, 'digest': 文章摘要,
'itemidx': 1,
'like_num': 18, 文章点赞数
Expand All @@ -222,7 +225,9 @@ def continue_info(self, nickname, begin=0):
# 获取文章数据
artiacle_datas.append(
self.officical.articles(
nickname, begin=str(begin), count=str(count)))
nickname, begin=str(begin), count=str(count)
)
)
except Exception as e:
print(e)
break
Expand All @@ -232,6 +237,7 @@ def continue_info(self, nickname, begin=0):

def flatten(x):
    """
    Recursively flatten arbitrarily nested lists into a single flat list.

    Parameters
    ----------
    x: object
        A (possibly nested) list, or any non-list value

    Returns
    -------
    list:
        The leaf values in left-to-right order; a non-list ``x`` yields ``[x]``
    """
    # isinstance (instead of an exact type comparison) also unwraps list
    # subclasses, which the exact check would have kept as opaque leaves.
    if not isinstance(x, list):
        return [x]
    return [leaf for item in x for leaf in flatten(item)]

# flatten = lambda x: [y for l in x for y in flatten(l)] if type(x) is list else [x]
print("第{}篇文章爬取失败,请过段时间再次尝试或换个帐号继续爬取".format(begin))
return self.__extract_info(flatten(artiacle_datas))
51 changes: 21 additions & 30 deletions wechatarticles/ArticlesInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,8 @@ class ArticlesInfo(object):
"""
登录WeChat,获取更加详细的推文信息。如点赞数、阅读数、评论等
"""
def __init__(self,
appmsg_token,
cookie,
proxies={
'http': None,
'https': None
}):

def __init__(self, appmsg_token, cookie, proxies={"http": None, "https": None}):
"""
初始化参数
Parameters
Expand All @@ -32,9 +27,8 @@ def __init__(self,
self.s.trust_env = False
self.appmsg_token = appmsg_token
self.headers = {
"User-Agent":
"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
"Cookie": cookie
"User-Agent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
"Cookie": cookie,
}
self.data = {
"is_only_read": "1",
Expand All @@ -58,8 +52,7 @@ def __verify_url(self, article_url):
verify_lst = ["mp.weixin.qq.com", "__biz", "mid", "sn", "idx"]
for string in verify_lst:
if string not in article_url:
raise Exception(
"params is error, please check your article_url")
raise Exception("params is error, please check your article_url")

def read_like_nums(self, article_url):
"""
Expand All @@ -76,8 +69,11 @@ def read_like_nums(self, article_url):
"""
try:
appmsgstat = self.__get_appmsgext(article_url)["appmsgstat"]
return appmsgstat["read_num"], appmsgstat["like_num"], appmsgstat[
"old_like_num"]
return (
appmsgstat["read_num"],
appmsgstat["like_num"],
appmsgstat["old_like_num"],
)
except Exception:
raise Exception("params is error, please check your article_url")

Expand Down Expand Up @@ -130,11 +126,10 @@ def comments(self, article_url):
__biz, _, idx, _ = self.__get_params(article_url)
getcomment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz={}&idx={}&comment_id={}&limit=100"
try:
url = getcomment_url.format(__biz, idx,
self.__get_comment_id(article_url))
comment_json = self.s.get(url,
headers=self.headers,
proxies=self.proxies).json()
url = getcomment_url.format(__biz, idx, self.__get_comment_id(article_url))
comment_json = self.s.get(
url, headers=self.headers, proxies=self.proxies
).json()
except Exception as e:
print(e)
comment_json = {}
Expand All @@ -155,8 +150,7 @@ def __get_comment_id(self, article_url):
"""
res = self.s.get(article_url, data=self.data, proxies=self.proxies)
# 使用正则提取comment_id
comment_id = re.findall(r'comment_id = "\d+"',
res.text)[0].split(" ")[-1][1:-1]
comment_id = re.findall(r'comment_id = "\d+"', res.text)[0].split(" ")[-1][1:-1]
return comment_id

def __get_params(self, article_url):
Expand All @@ -177,7 +171,7 @@ def __get_params(self, article_url):

# 切分url, 提取相应的参数
string_lst = article_url.split("?")[1].split("&")
dict_value = [string[string.index("=") + 1:] for string in string_lst]
dict_value = [string[string.index("=") + 1 :] for string in string_lst]
__biz, mid, idx, sn, *_ = dict_value
sn = sn[:-3] if sn[-3] == "#" else sn

Expand Down Expand Up @@ -213,21 +207,18 @@ def __get_appmsgext(self, article_url):

# 将params参数换到data中请求。这一步貌似不换也行
origin_url = "https://mp.weixin.qq.com/mp/getappmsgext?"
appmsgext_url = origin_url + "appmsg_token={}&x5=0".format(
self.appmsg_token)
appmsgext_url = origin_url + "appmsg_token={}&x5=0".format(self.appmsg_token)
self.data["__biz"] = __biz
self.data["mid"] = mid
self.data["sn"] = sn
self.data["idx"] = idx

# appmsgext_url = origin_url + "__biz={}&mid={}&sn={}&idx={}&appmsg_token={}&x5=1".format(
# __biz, mid, sn, idx, self.appmsg_token)
appmsgext_json = requests.post(appmsgext_url,
headers=self.headers,
data=self.data,
proxies=self.proxies).json()
appmsgext_json = requests.post(
appmsgext_url, headers=self.headers, data=self.data, proxies=self.proxies
).json()

if "appmsgstat" not in appmsgext_json.keys():
raise Exception(
"get info error, please check your cookie and appmsg_token")
raise Exception("get info error, please check your cookie and appmsg_token")
return appmsgext_json
Loading

0 comments on commit 85a679d

Please sign in to comment.