-
Notifications
You must be signed in to change notification settings - Fork 2
/
vh_fetch.py
executable file
·94 lines (83 loc) · 2.84 KB
/
vh_fetch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#! /usr/bin/python
#
# Copyright (C) 2012 Vivek Haldar
#
# This fetches all my RSS feeds and writes the articles from the last N days
# (N is given on the command line) to text files in the current directory.
#
# Author: Vivek Haldar <[email protected]>
import argparse
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from threading import Thread
import feedparser
import output_file
import output_prn
# Fetch each RSS feed in a thread by itself, so that we can grab all of them in
# parallel.
class Fetcher(Thread):
    """Thread that downloads one feed and collects its recent articles.

    Each instance parses a single feed URL with feedparser and records the
    entries that are newer than ``days`` days in the shared ``articles``
    dict, as ``feed_title -> [(entry_title, plain_text_body), ...]``.
    """

    def __init__(self, rss_url, articles, days):
        """Store the feed URL, the shared result dict and the age limit.

        Args:
            rss_url: URL of the feed to fetch.
            articles: dict shared between fetchers; this thread adds one
                key (the feed title) to it.
            days: maximum age, in days, of the entries to keep.
        """
        super(Fetcher, self).__init__()
        self._rss_url = rss_url
        self._articles = articles
        self._days = days

    def run(self):
        """Thread entry point: fetch the feed and report any failure."""
        print(self._rss_url)
        try:
            self.fetch()
        except Exception as e:
            # Best effort: a broken feed is reported and skipped so that it
            # cannot take the other fetchers down with it.
            print('== error== %s' % (e,))

    def fetch(self):
        """Parse the feed and store its recent entries in the shared dict.

        Entries without a usable publication/update date are skipped.
        Errors (network, parsing, a feed without a title, ...) propagate to
        the caller; ``run`` catches and prints them.
        """
        # Use this thread's own URL.  The previous code read a bare ``s``,
        # which only resolved through a module-level loop variable and could
        # therefore point at a different feed by the time the thread ran.
        parsed = feedparser.parse(self._rss_url)
        feed_title = parsed.feed.title
        print(feed_title)

        recent = []
        self._articles[feed_title] = recent

        # feedparser normalizes its *_parsed dates to UTC, so the cutoff has
        # to be computed from the current UTC time, not from local time.
        cutoff = datetime.utcnow() - timedelta(days=self._days)

        for entry in parsed.entries:
            # Prefer the publication date, fall back to the update date; both
            # may be absent or None when the feed gives no parseable date.
            pub = entry.get('published_parsed') or entry.get('updated_parsed')
            if pub is None:
                continue
            pub_date = datetime(pub.tm_year, pub.tm_mon, pub.tm_mday,
                                pub.tm_hour, pub.tm_min)
            if pub_date <= cutoff:
                continue

            print('  %s' % (entry.title,))
            if 'content' in entry:
                body = entry.content[0].value
            else:
                body = entry.get('summary', '')
            # Strip the markup so that only the readable text is kept.
            plain_text = BeautifulSoup(body).get_text()
            recent.append((entry.title, plain_text))
if __name__ == '__main__':
    # Command line: a mandatory look-back window in days, plus an optional
    # switch selecting the "print" subscriptions and output format.
    parser = argparse.ArgumentParser()
    parser.add_argument("days", help='Fetch articles going back this many days',
                        type=int)
    parser.add_argument("--prn", help='If true, use the print subscriptions and output prn',
                        action='store_true')
    args = parser.parse_args()
    print('Fetching the last %d days' % args.days)
    print('Print?  %s' % args.prn)

    # Read subscriptions from other file.
    subs_file = 'subscriptions_print.py' if args.prn else 'subscriptions.py'
    # NOTE(review): eval() executes whatever the subscriptions file contains.
    # That is only acceptable while the file is trusted and locally owned;
    # consider ast.literal_eval if it is a plain literal list of URLs.
    with open(subs_file) as subs:
        subscriptions = eval(subs.read())

    # This is a map: feed_title -> list of articles
    articles = {}
    threads = []
    for s in subscriptions:
        t = Fetcher(s, articles, args.days)
        # Daemon threads let the process exit once the output is written,
        # even if a feed is still hanging after the join timeout below.
        t.daemon = True
        threads.append(t)
        t.start()

    # Join
    for t in threads:
        t.join(30.0)  # 30 s timeout.
    print("OK, got all the feeds...")

    # OK, now we have the dict with all the content... ditch it out to files.
    if args.prn:
        output_prn.OutputPrn(articles).output()
    else:
        output_file.OutputFile(articles).output()