-
Notifications
You must be signed in to change notification settings - Fork 43
/
train_threads.py
78 lines (71 loc) · 2.61 KB
/
train_threads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
from tqdm import tqdm
from MyBotometer import Botometer, NoTimelineError
from tweepy.error import TweepError
import tweepy
from argparse import ArgumentParser
import os
from queue import Queue
from threading import Thread
def crawling(data_q, pbar):
    """Worker thread body: drain user items from *data_q*, query Botometer
    for each, and persist each result as ``<id>.json`` under the global
    ``save_path``.

    Args:
        data_q: queue.Queue of dicts with at least ``'id'`` and
            ``'username'`` keys (presumably user records — see the
            ``users_test.json`` loader in ``__main__``).
        pbar: shared tqdm progress bar; advanced once per item handled.

    Relies on module-level globals ``save_path`` (output directory) and
    ``bom`` (Botometer client) assigned in the ``__main__`` block.
    """
    # Function-scope import keeps the file-level import block untouched.
    from queue import Empty

    while True:
        # get_nowait() instead of empty()+get(): that pair is racy with
        # multiple workers — another thread can take the last item between
        # the two calls and leave this one blocked forever on get().
        try:
            item = data_q.get_nowait()
        except Empty:
            break
        path = os.path.join(save_path, '{}.json'.format(item['id']))
        if os.path.exists(path):
            # Result already saved by a previous run; skip re-crawling.
            pbar.update()
            continue
        if item['username'] is None:
            result = 'the username is None'
            # Context manager closes the handle deterministically (the
            # original open() leaked the file object).
            with open(path, 'w') as fp:
                json.dump(result, fp)
            pbar.update()
            continue
        username = '@{}'.format(item['username'].strip())
        pbar.set_postfix_str(username)
        try:
            result = bom.check_account(username)
        except NoTimelineError:
            result = 'this user does not have any tweets'
        except TweepError:
            result = 'api can\'t get the timeline'
        with open(path, 'w') as fp:
            json.dump(result, fp)
        pbar.update()
if __name__ == '__main__':
    # CLI: which dataset to score, how many worker threads, optional HTTP proxy.
    parser = ArgumentParser()
    parser.add_argument('--dataset_name', type=str)
    parser.add_argument('--threads', type=int, default=5)
    parser.add_argument('--proxy', type=str)
    args = parser.parse_args()
    proxy = args.proxy

    username_path = 'tmp/username/{}'.format(args.dataset_name)
    if not os.path.exists(username_path):
        # Actionable message instead of a bare ValueError.
        raise ValueError('username directory not found: {}'.format(username_path))

    # API credentials live in tmp/key.json; the context manager closes the
    # handle deterministically (the original json.load(open(...)) leaked it).
    with open('tmp/key.json') as fp:
        key = json.load(fp)
    rapid_api_key = key['rapid_api_key']
    consumer_key = key['consumer_key']
    consumer_secret = key['consumer_secret']
    access_token = key['access_token']
    access_token_secret = key['access_token_secret']

    # Twitter API client shared by the Botometer wrapper.
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth,
                     wait_on_rate_limit=True,
                     proxy=proxy,
                     parser=tweepy.parsers.JSONParser())
    bom = Botometer(rapid_api_key=rapid_api_key,
                    twitter_api=api,
                    proxy=proxy)

    with open(os.path.join(username_path, 'users_test.json')) as fp:
        usernames = json.load(fp)
    pbar = tqdm(total=len(usernames), ncols=0)

    save_path = 'tmp/scores/{}'.format(args.dataset_name)
    # exist_ok=True is idempotent and avoids the exists()/makedirs race.
    os.makedirs(save_path, exist_ok=True)

    # One shared queue feeds every worker; crawling() drains it until empty.
    queue = Queue()
    for user in usernames:
        queue.put(user)

    threads = [Thread(target=crawling, args=(queue, pbar))
               for _ in range(args.threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()