diff --git a/bustag/spider/bus_spider.py b/bustag/spider/bus_spider.py
index 45e3464..9b6a564 100644
--- a/bustag/spider/bus_spider.py
+++ b/bustag/spider/bus_spider.py
@@ -9,7 +9,7 @@ from .db import save, Item
 from bustag.util import APP_CONFIG, get_full_url, logger
 
 router = get_router()
-MAXPAGE = 30
+MAXPAGE = 300
 
 
 def get_url_by_fanhao(fanhao):
diff --git a/bustag/spider/parser.py b/bustag/spider/parser.py
index 76fd430..4b1db18 100644
--- a/bustag/spider/parser.py
+++ b/bustag/spider/parser.py
@@ -1,6 +1,7 @@
 '''
 html parser to extract data
 '''
+import sys
 import re
 from collections import namedtuple
 from requests_html import HTML
@@ -25,7 +26,13 @@ def parse_item(text):
     title_css = 'body > div.container > h3'
     title = html.find(title_css)[0].text
     cover_img_css = 'body > div.container > div.row.movie > div.col-md-9.screencap > a'
-    cover_img_url = "http://www.javbus.com" + html.find(cover_img_css)[0].attrs['href']
+    a = "http://www.javbus.com"
+    b = html.find(cover_img_css)[0].attrs['href']
+    if re.match(r'^https?:/{2}\w.+$', b):
+        cover_img_url = b
+    else:
+        cover_img_url = a + b
+    #cover_img_url = "http://www.javbus.com" + html.find(cover_img_css)[0].attrs['href']
     tags_css = 'body > div.container > div.row.movie > div.col-md-3.info'
     tags = html.find(tags_css)[0].find('p')
     release_date = tags[1].text
diff --git a/requirements.txt b/requirements.txt
index 09f37b8..286b00b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -120,4 +120,4 @@ wrapt==1.11.2
 yarg==0.1.9
 yarl==1.3.0
 zipp==0.5.2
--e git+https://github.com/dockerfile-test/bustag.git#egg=bustag
+-e git+https://github.com/dockerfile-test/bustag.git@dev#egg=bustag