Skip to content

Commit

Permalink
update trust env and add download media
Browse files Browse the repository at this point in the history
  • Loading branch information
wnma3mz committed Jan 20, 2021
1 parent 1100999 commit 274b50b
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 153 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
dist/
build/
*.egg-info/
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# 微信公众号文章爬虫(微信文章阅读点赞的获取)

![](https://img.shields.io/pypi/v/wechatarticles)

安装

`pip install wechatarticles`
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setuptools.setup(
name="wechatarticles",
version="0.5.6",
version="0.5.7",
author="wnma3mz",
author_email="[email protected]",
description="wechat articles scrapy",
Expand Down
1 change: 1 addition & 0 deletions wechatarticles/ArticlesInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def __init__(self,
None
"""
self.s = requests.session()
self.s.trust_env = False
self.appmsg_token = appmsg_token
self.headers = {
"User-Agent":
Expand Down
150 changes: 0 additions & 150 deletions wechatarticles/ReadOutfile.py

This file was deleted.

36 changes: 34 additions & 2 deletions wechatarticles/Url2Html.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,12 @@ def get_title(self, html):
html: 文章源码
"""
try:
title = html.split('activity-name">')[1].split('</h2')[0].strip()
# title = html.split('activity-name">')[1].split('</h2')[0].strip()
title = html.split('<h2')[1].split('</h2')[0].split('>')[1].strip()
return title
except Exception as e:
print(e)
print(html.split('<h2')[1].split('</h2')[0])
return ''

def article_info(self, html):
Expand Down Expand Up @@ -140,6 +142,30 @@ def rename_title(self, title, html):
'[{}]-{}-{}'.format(account_name, date, title))
return title

def download_media(self, html, title):
    """Download the audio clips and the video referenced by an article page.

    Audio: each ``<mpvoice>`` tag in ``html`` is fetched from the voice
    endpoint using its ``voice_encode_fileid`` attribute and written to
    ``<title>-<n>.mp3``, where ``n`` is the 1-based position of the tag.

    Video: the first ``url: '...'`` entry in the page source that does not
    contain ``videoplayer`` is written to ``<title>.mp4``.

    Files that already exist are not downloaded again. Responses with an
    error status are skipped, so an error page is never saved as media and
    a later call can retry.

    Args:
        html: article page source.
        title: output file name without extension; used as given, so it
            may carry a directory prefix.

    Returns:
        ``''`` when ``<title>.mp4`` already exists, otherwise ``None``.
    """
    # run() stores the caller's proxy settings on the instance; fall back to
    # the requests default when this method is called on its own.
    proxies = getattr(self, 'proxies', None)

    soup = bs(html, 'lxml')

    # mp3
    base_url = 'https://res.wx.qq.com/voice/getvoice?mediaid='
    for i, item in enumerate(soup.find_all('mpvoice'), 1):
        audio_path = '{}-{}.mp3'.format(title, i)
        if os.path.isfile(audio_path):
            continue
        # A tag without the id attribute cannot be fetched; skip it rather
        # than abort the remaining downloads with a KeyError.
        media_id = item.get('voice_encode_fileid')
        if not media_id:
            continue
        doc = requests.get(base_url + media_id, proxies=proxies)
        if not doc.ok:
            continue
        with open(audio_path, 'wb') as f:
            f.write(doc.content)

    # video
    video_path = '{}.mp4'.format(title)
    if os.path.isfile(video_path):
        return ''
    candidates = [
        url for url in re.findall(r'url: \'(.+)\',\n', html)
        if 'videoplayer' not in url
    ]
    if not candidates:
        return None
    # The page source carries '&' as the literal four characters \x26.
    video_url = candidates[0].replace(r'\x26', '&')
    doc = requests.get(video_url, proxies=proxies)
    if doc.ok:
        with open(video_path, 'wb') as f:
            f.write(doc.content)
    return None

def run(self, url, mode, proxies={'http': None, 'https': None}, **kwargs):
"""
启动函数
Expand All @@ -149,14 +175,15 @@ def run(self, url, mode, proxies={'http': None, 'https': None}, **kwargs):
2: 返回html源码,下载图片但不替换图片路径
3: 返回html源码,下载图片且替换图片路径
4: 保存html源码,下载图片且替换图片路径
5: 保存html源码,下载图片且替换图片路径,并下载视频与音频
kwargs:
account: 公众号名
title: 文章名
date: 日期
proxies: 代理
img_path: 图片下载路径
"""
self.proxies = proxies
self.proxies = proxies
if mode == 1:
return requests.get(url, proxies=proxies).text
elif mode in [2, 3, 4]:
Expand Down Expand Up @@ -210,6 +237,11 @@ def run(self, url, mode, proxies={'http': None, 'https': None}, **kwargs):
html = requests.get(url, proxies=proxies).text
title = self.rename_title(title, html)

try:
if mode == 5:
self.download_media(html, title)
except Exception as e:
print(fj, title)
html_img, _ = self.replace_img(html)
with open('{}.html'.format(title), 'w',
encoding='utf-8') as f:
Expand Down

0 comments on commit 274b50b

Please sign in to comment.