-
Notifications
You must be signed in to change notification settings - Fork 0
/
populate_verified_users.py
146 lines (112 loc) · 4.12 KB
/
populate_verified_users.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
""" Scrapes verified users using Twitter API to prep for matching later. """
from __future__ import print_function

import argparse
import io
import json
import sqlite3
import sys

import tweepy
def connect_api():
    """ Load config, connect to the Twitter API and open the local database.

    Reads credentials from `config/auth.json`, builds an authenticated
    tweepy client that waits when rate limited, opens the SQLite database
    at `data/verified_users` and executes the SQL statement stored under
    the `table_spec` key of the config (presumably the CREATE TABLE for
    `twitter_users` -- confirm against `config/auth_blank.json`).

    Returns:
        An `(api, db, conn)` tuple: the tweepy API client, a SQLite cursor
        and the connection that cursor belongs to.

    Exits the process with status 1 when the auth file cannot be opened or
    does not contain valid JSON.
    """
    try:
        # `with` closes the file handle even if JSON parsing fails.
        with open("config/auth.json", "r") as auth_file:
            auth_data = json.load(auth_file)
    except (IOError, ValueError):
        # IOError: missing/unreadable file; ValueError: malformed JSON.
        # A bare `except:` would also swallow Ctrl-C and SystemExit.
        print("Error reading auth file, please copy `config/auth_blank.json` to `config/auth.json` and fill out.")
        sys.exit(1)

    auth = tweepy.OAuthHandler(
        auth_data["consumer_key"], auth_data["consumer_secret"]
    )
    auth.set_access_token(
        auth_data["access_token"], auth_data["access_token_secret"]
    )
    api = tweepy.API(
        auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True
    )

    conn = sqlite3.connect("data/verified_users")
    db = conn.cursor()
    db.execute(auth_data["table_spec"])
    conn.commit()
    return api, db, conn
def scrape_raw_verified(api, db, conn):
    """ Collect the IDs of every account that @verified follows.

    Each ID is inserted into `twitter_users` (duplicates are ignored) and
    the connection is committed after every tenth ID and once more at the
    end. Returns the list of IDs in the order the API yielded them.
    """
    collected = []
    friends = tweepy.Cursor(api.friends_ids, screen_name="verified")
    for seen, friend_id in enumerate(friends.items(), 1):
        collected.append(friend_id)
        db.execute(
            "INSERT OR IGNORE INTO twitter_users (id) VALUES (?)",
            (friend_id,))
        # Flush to disk every ten inserts.
        if seen % 10 == 0:
            conn.commit()
    conn.commit()
    return collected
def load_unhydrated(db):
    """ Return the IDs of all stored users whose `processed` flag is NULL. """
    db.execute("SELECT id FROM twitter_users WHERE processed IS NULL;")
    # Each row is a one-element tuple holding the selected `id` column.
    return [user_id for (user_id,) in db.fetchall()]
def batch_hydrate(all_ids, api, db, conn):
    """ Hydrate `all_ids` in consecutive slices of at most 100 IDs each. """
    starts = range(0, len(all_ids), 100)
    for batch_number, start in enumerate(starts):
        print(" Processing batch %s" % batch_number)
        hydrate_users(all_ids[start:start + 100], api, db, conn)
def hydrate_users(ids, api, db, conn):
    """ Hydrate the unhydrated users specified in `ids`.

    Looks the IDs up with one `lookup_users` call, prints a short summary
    of every returned user and writes the profile fields to
    `twitter_users`, setting `processed = 1` on each row written.

    Args:
        ids: user IDs to look up in a single API call.
        api: client exposing `lookup_users` (the tweepy API object here).
        db: SQLite cursor used for the UPDATE statements.
        conn: connection owning `db`; committed once the batch is written.

    Returns:
        None. If the lookup raises `tweepy.TweepError` a message is printed
        and nothing is written.
    """
    try:
        users = api.lookup_users(user_ids=ids, include_entities=1)
    except tweepy.TweepError:
        print("Error completing user lookup; no results found.")
        return

    for user in users:
        profile = user._json
        user_id = profile["id"]
        name = profile.get("name", "No Name")
        screen_name = profile.get("screen_name", "No Screen Name")
        followers = int(profile.get("followers_count", 0))
        location = profile.get("location", "")
        description = profile.get("description", "")
        print("%s (@%s, %s followers)\n%s\n%s" %
              (name, screen_name, followers, location, description))
        # No comma after the last assignment: a trailing comma before WHERE
        # is a SQLite syntax error and would abort every update.
        db.execute(
            """
            UPDATE twitter_users SET name = ?, username = ?,
            location = ?, bio = ?, followers = ?, processed = 1
            WHERE id = ?;
            """,
            [name, screen_name, location, description,
             followers, user_id]
        )
    conn.commit()
def delete_non_us(db, conn, locations_path="config/non_us_locations.txt"):
    """ Remove all non-US accounts to reduce size of data.

    Reads one location string per line from `locations_path`, ignoring
    blank lines and surrounding whitespace, and deletes every
    `twitter_users` row whose `location` equals one of them exactly.

    Args:
        db: SQLite cursor used for the deletes.
        conn: connection owning `db`; committed once after all deletes.
        locations_path: text file listing the locations to purge.
    """
    # io.open yields unicode text on both Python 2 and 3, so no `unicode()`
    # call is needed. UTF-8 is assumed for the list file (TODO confirm); it
    # is a superset of ASCII, so plain-ASCII lists read the same as before.
    with io.open(locations_path, "r", encoding="utf-8") as locations_file:
        stripped = (line.strip() for line in locations_file)
        non_us_locations = [(location,) for location in stripped if location]

    db.executemany(
        "DELETE FROM twitter_users WHERE location = ?", non_us_locations)
    conn.commit()
def parse_arguments(argv=None):
    """ Parse command-line options and run the whole pipeline.

    With `--rescan` the list of accounts followed by @verified is scraped
    again; otherwise the IDs still unprocessed in the database are loaded.
    Those IDs are hydrated, non-US accounts are deleted and the database
    is vacuumed.

    Args:
        argv: argument list to parse; `None` (the default) makes argparse
            read `sys.argv[1:]`.
    """
    parser = argparse.ArgumentParser(
        description="Gathers a list of verified accounts on Twitter"
    )
    # nargs="?" lets `--rescan` work as a bare flag while still accepting
    # the old `--rescan VALUE` form; without it argparse rejects the bare
    # flag with "expected one argument".
    parser.add_argument(
        "--rescan", nargs="?", const=True, default=False,
        help="Manually rescan verified users"
    )
    args = parser.parse_args(argv)

    api, db, conn = connect_api()
    try:
        if args.rescan:
            print("Scraping verified users...")
            ids = scrape_raw_verified(api, db, conn)
        else:
            print("Loading unhydrated users from db...")
            ids = load_unhydrated(db)

        if ids:
            print("%s users to process" % len(ids))
            batch_hydrate(ids, api, db, conn)

        print("Deleting non-us users...")
        delete_non_us(db, conn)

        print("Defragmenting database...")
        db.execute("VACUUM")
        conn.commit()
        print("OK, done.")
    finally:
        # Release the database file even if a step above fails.
        conn.close()
# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    parse_arguments()