-
Notifications
You must be signed in to change notification settings - Fork 34
/
count-tweets.py
142 lines (118 loc) · 4.01 KB
/
count-tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
'''
count-tweets.py
Counts number of tweets over time, either from a file in json format, or
from a MongoDB collection of tweets
@p_barbera
Usage:
## count tweets in a file (tweets in json format), by minute
python count-tweets.py -f tweets.json -t minute
## count tweets in a file (tweets in json format), by hour
python count-tweets.py -f tweets.json -t hour
## count tweets in a MongoDB collection, by minute
python count-tweets.py -host localhost -db tweets -c example -t minute
'''
import argparse
import pymongo
import json
from pymongo import Connection
from datetime import datetime
# arguments
# --time is restricted to the three granularities the counting functions
# implement; any other value used to be accepted and silently printed nothing.
parser = argparse.ArgumentParser(
    description='Count tweets over time, from a json file or a MongoDB collection')
parser.add_argument('-f', '--file',
    help='name of file with tweets in json format')
parser.add_argument('-t', '--time', required=True,
    choices=['minute', 'hour', 'day'],
    help='time period for tweet counting')
parser.add_argument('-host', '--host',
    help='MongoDB host to connect to; enables counting from MongoDB')
parser.add_argument('-u', '--user', default='',
    help='MongoDB user name (authentication is skipped when empty)')
parser.add_argument('-pwd', '--password', default='',
    help='MongoDB password, used together with --user')
parser.add_argument('-db', '--database',
    help='name of the MongoDB database')
parser.add_argument('-c', '--collection',
    help='name of the MongoDB collection with the tweets')
args = parser.parse_args()
## function to count tweets in a file
def count_tweets_json(filename, time):
    """Print tweet counts per time period for a file of tweets in json format.

    The file is read line by line, one json document per line. Lines that
    are not valid json, or whose ``created_at`` field is missing or does not
    match the ``'%a %b %d %H:%M:%S +0000 %Y'`` format, are skipped.

    One ``<period>,<count>`` line is printed per period, sorted by period.

    Args:
        filename: path of the file to read.
        time: counting granularity: ``'minute'``, ``'hour'`` or ``'day'``.

    Raises:
        ValueError: if ``time`` is not one of the supported granularities.
    """
    # Output label format for each supported granularity.
    period_formats = {
        'minute': '%Y/%m/%d %H:%M',
        'hour': '%Y/%m/%d %H',
        'day': '%Y/%m/%d',
    }
    if time not in period_formats:
        raise ValueError(
            "unsupported time period %r (expected 'minute', 'hour' or 'day')"
            % (time,))
    period_format = period_formats[time]

    counts = {}
    # The context manager closes the file even if reading fails part-way.
    with open(filename, 'r') as tweets_file:
        for line in tweets_file:
            try:
                tweet = json.loads(line)
                created = datetime.strptime(
                    tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
            except (KeyError, TypeError, ValueError):
                # ValueError: invalid json or unparsable date;
                # KeyError/TypeError: no usable 'created_at' field.
                continue
            period = created.strftime(period_format)
            counts[period] = counts.get(period, 0) + 1

    # The zero-padded labels sort chronologically as plain strings.
    for period, count in sorted(counts.items()):
        print(period + "," + str(count))
## function to count tweets in MongoDB
def count_tweets_mongodb(db, collection_name, time):
    """Print tweet counts per time period for a MongoDB collection.

    Runs an aggregation that groups documents by the date parts of their
    ``timestamp`` field, counts each group and sorts the groups
    chronologically. One ``<period>,<count>`` line is printed per group.

    Args:
        db: database object; ``db[collection_name]`` must provide
            ``aggregate(pipeline)``.
        collection_name: name of the collection holding the tweets.
        time: counting granularity: ``'minute'``, ``'hour'`` or ``'day'``.

    Raises:
        ValueError: if ``time`` is not one of the supported granularities.
    """
    from collections import OrderedDict

    # (name in the group key, MongoDB date operator), coarsest part first.
    date_parts = [
        ('year', '$year'),
        ('month', '$month'),
        ('day', '$dayOfMonth'),
        ('hour', '$hour'),
        ('minute', '$minute'),
    ]
    # granularity -> (number of leading date parts used, output label format)
    layouts = {
        'day': (3, "%04d/%02d/%02d"),
        'hour': (4, "%04d/%02d/%02d %02d"),
        'minute': (5, "%04d/%02d/%02d %02d:%02d"),
    }
    if time not in layouts:
        raise ValueError(
            "unsupported time period %r (expected 'minute', 'hour' or 'day')"
            % (time,))
    depth, label_format = layouts[time]
    parts = date_parts[:depth]

    group_key = dict((name, {operator: "$timestamp"}) for name, operator in parts)
    # $sort applies its keys in document order, so the spec must be an
    # ordered mapping: a plain dict does not guarantee year-before-month
    # ordering on older Python versions.
    sort_spec = OrderedDict(("_id." + name, 1) for name, _ in parts)
    pipeline = [
        {"$group": {"_id": group_key, "count": {"$sum": 1}}},
        {"$sort": sort_spec},
    ]

    reply = db[collection_name].aggregate(pipeline)
    # Legacy drivers return a dict with the documents under 'result';
    # otherwise treat the reply as an iterable of documents (cursor-style).
    groups = reply['result'] if isinstance(reply, dict) else reply
    for group in groups:
        key = group['_id']
        label = label_format % tuple(key[name] for name, _ in parts)
        print(label + "," + str(group['count']))
## counting tweets from file
if args.file is not None:
    count_tweets_json(args.file, args.time)

# counting tweets from MongoDB
if args.host is not None:
    try:
        connection = pymongo.Connection(args.host)
    except Exception:
        # Nothing below can run without a connection. Previously execution
        # continued and failed with a NameError on the unbound `connection`.
        print('Connection error')
        raise SystemExit(1)
    db = connection[args.database]
    # Authenticate only when a user name was supplied on the command line.
    if args.user != '':
        db.authenticate(args.user, args.password)
    count_tweets_mongodb(db, args.collection, args.time)