#!/usr/bin/env python3
# Foundations of Python Network Programming, Third Edition
# https://github.com/brandon-rhodes/fopnp/blob/m/py3/chapter11/rscrape1.py
# Recursive scraper built using the Requests library.

import argparse, requests
from urllib.parse import urljoin, urlsplit
from lxml import etree

def GET(url):
    # Fetch one page and, if it is HTML, yield a (function, url) call
    # tuple for every link it contains, so scrape() can follow them.
    response = requests.get(url)
    if response.headers.get('Content-Type', '').split(';')[0] != 'text/html':
        return
    text = response.text
    try:
        html = etree.HTML(text)
    except Exception as e:
        print('    {}: {}'.format(e.__class__.__name__, e))
        return
    links = html.findall('.//a[@href]')
    for link in links:
        yield GET, urljoin(url, link.attrib['href'])

def scrape(start, url_filter):
    # Work through a set of pending call tuples, remembering every tuple
    # already seen so that no URL is fetched twice.
    further_work = {start}
    already_seen = {start}
    while further_work:
        call_tuple = further_work.pop()
        function, url, *etc = call_tuple
        print(function.__name__, url, *etc)
        for call_tuple in function(url, *etc):
            if call_tuple in already_seen:
                continue
            already_seen.add(call_tuple)
            function, url, *etc = call_tuple
            if not url_filter(url):
                continue
            further_work.add(call_tuple)

def main(GET):
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    start_url = parser.parse_args().url
    starting_netloc = urlsplit(start_url).netloc
    # Follow only links that stay on the host where the crawl started.
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter)

if __name__ == '__main__':
    main(GET)
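
For contrast, here is a minimal sketch of driving scrape() with a different fetcher. Everything in it is illustrative rather than part of rscrape1.py: the fetch_and_store() generator, the pages dictionary, and the http://127.0.0.1:8000/ test address are assumptions. The point it demonstrates is that scrape() only unpacks (function, url, ...) call tuples and invokes whatever function they name, so any generator that yields tuples of the same shape can plug in.

    import requests
    from urllib.parse import urljoin, urlsplit
    from lxml import etree

    from rscrape1 import scrape        # reuse the engine defined above

    pages = {}  # url -> HTML text, filled in as the crawl proceeds

    def fetch_and_store(url):
        # Same shape as GET() above, but records each page body as it goes.
        # (The try/except around parsing in GET() is omitted for brevity.)
        response = requests.get(url)
        if response.headers.get('Content-Type', '').split(';')[0] != 'text/html':
            return
        pages[url] = response.text
        html = etree.HTML(response.text)
        for link in html.findall('.//a[@href]'):
            yield fetch_and_store, urljoin(url, link.attrib['href'])

    if __name__ == '__main__':
        start = 'http://127.0.0.1:8000/'   # assumed local test server
        netloc = urlsplit(start).netloc
        scrape((fetch_and_store, start),
               lambda url: urlsplit(url).netloc == netloc)
        print(len(pages), 'pages stored')

Run against a small local site, this prints one line per page just as rscrape1.py does, then reports how many page bodies it kept.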