-
Notifications
You must be signed in to change notification settings - Fork 1
/
prepare_data_yelp_yahoo.py
73 lines (54 loc) · 2.15 KB
/
prepare_data_yelp_yahoo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import argparse
import requests
import tarfile
import os
def download_file_from_google_drive(id, destination):
    """Download a Google Drive file by its file id and save it locally.

    A first request is sent to the Drive download endpoint.  If the
    response carries a ``download_warning*`` cookie (looked up through
    ``get_confirm_token``), the request is repeated with that cookie's
    value as the ``confirm`` parameter.  The body of the final response
    is streamed to ``destination`` via ``save_response_content``.

    Args:
        id: Google Drive file id.  The name shadows the ``id`` builtin but
            is kept so that existing keyword callers continue to work.
        destination: Path of the local file to write.

    Raises:
        requests.HTTPError: If the final response has a 4xx/5xx status.
    """
    URL = "https://docs.google.com/uc?export=download"

    # The context manager closes the session (and its pooled connections)
    # even when the download fails part-way through.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        token = get_confirm_token(response)

        if token:
            # Repeat the request, this time echoing the token back.
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)

        # Fail here instead of writing an HTTP error page to disk as if it
        # were the requested file.
        response.raise_for_status()
        save_response_content(response, destination)
def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie, if any.

    Args:
        response: Object exposing a ``cookies`` mapping (its ``items()``
            method is iterated for name/value pairs).

    Returns:
        The value of the first cookie whose name starts with
        ``download_warning``, or ``None`` when no such cookie is present.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    # next() with a default yields the first match or None, without
    # consuming the rest of the cookies.
    return next(matching_values, None)
def save_response_content(response, destination):
    """Write the body of ``response`` to the file at ``destination``.

    The body is pulled through ``response.iter_content`` in pieces of
    32768 bytes and written in binary mode; ``destination`` is created or
    truncated.

    Args:
        response: Object exposing ``iter_content(chunk_size)`` that yields
            bytes chunks.
        destination: Path of the file to write.
    """
    block_bytes = 32768

    with open(destination, "wb") as sink:
        # filter(None, ...) drops empty chunks (keep-alive new chunks).
        sink.writelines(filter(None, response.iter_content(block_bytes)))
if __name__ == "__main__":
    # Script entry point: download the selected dataset archive(s) from
    # Google Drive and unpack them into ./datasets.
    parser = argparse.ArgumentParser(description="data downloading")
    parser.add_argument('--dataset', choices=["synthetic", "yahoo", "yelp", "omniglot", "all"],
                        default="all", help='dataset to use')
    args = parser.parse_args()

    # Google Drive file ids of the dataset archives, keyed by the
    # corresponding --dataset choice (insertion order is the "all" order).
    dataset_ids = {
        "synthetic": "1pEHLedf3ZSo7UrHdvR1VWPfWNTcN6oWH",
        "yahoo": "13azGlTuGdzWLCmgDmQPmvb_jcexVWX7i",
        "yelp": "1FT49oLNV8syhmGXEgiK6XTjEfMNqqEJJ",
        "omniglot": "1PbFAm2wtXfdnV7ZixImkBubC0A6-XwHU",
    }
    if args.dataset == "all":
        file_id = list(dataset_ids.values())
    else:
        file_id = [dataset_ids[args.dataset]]

    # Every archive is downloaded to this same temporary name, extracted,
    # and then deleted before the next one is fetched.
    destination = "datasets.tar.gz"

    start_dir = os.getcwd()
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("datasets", exist_ok=True)
    os.chdir("datasets")
    try:
        for file_id_e in file_id:
            try:
                download_file_from_google_drive(file_id_e, destination)
                # The context manager closes the archive even if
                # extraction raises.
                with tarfile.open(destination, "r:gz") as tar:
                    # NOTE(review): extractall() trusts the member paths in
                    # the downloaded archive; consider an extraction filter
                    # (e.g. filter="data") where the running Python
                    # supports it.
                    tar.extractall()
            finally:
                # Do not leave a partial or corrupt archive behind on failure.
                if os.path.exists(destination):
                    os.remove(destination)
    finally:
        # Restore the caller's working directory on success and on error.
        os.chdir(start_dir)