-
Notifications
You must be signed in to change notification settings - Fork 1
/
prepare_data.py
70 lines (51 loc) · 2 KB
/
prepare_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import argparse
import requests
import tarfile
import os
def download_file_from_google_drive(id, destination):
    """Download a Google Drive file and write it to ``destination``.

    A first GET is sent to the Drive download endpoint. If that response
    carries a ``download_warning*`` cookie (see ``get_confirm_token``), the
    request is repeated with the cookie value as the ``confirm`` parameter.
    The body of the final response is streamed to disk by
    ``save_response_content``.

    Args:
        id: Google Drive file id. The name shadows the builtin but is kept
            so existing keyword callers continue to work.
        destination: Path of the local file to write.

    Raises:
        requests.HTTPError: If either request returns a 4xx/5xx status.
            Without this check an error page would be written to
            ``destination`` as if it were the requested file.
    """
    URL = "https://docs.google.com/uc?export=download"

    # The context manager closes the session (and its pooled connections)
    # even when a request or the file write raises.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        response.raise_for_status()
        token = get_confirm_token(response)

        if token:
            # The first streamed body is never read; close it explicitly so
            # its connection is released before the confirmed request.
            response.close()
            params = {'id': id, 'confirm': token}
            response = session.get(URL, params=params, stream=True)
            response.raise_for_status()

        save_response_content(response, destination)
def get_confirm_token(response):
    """Return the value of the first ``download_warning*`` cookie, if any.

    Args:
        response: Object exposing a ``cookies`` mapping with ``items()``.

    Returns:
        The value of the first cookie whose name starts with
        ``download_warning``, or ``None`` when no such cookie is present.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    return next(matching_values, None)
def save_response_content(response, destination):
    """Stream the body of ``response`` into the file at ``destination``.

    Args:
        response: Object whose ``iter_content(chunk_size)`` yields byte
            chunks of the body.
        destination: Path of the file to create or overwrite in binary mode.
    """
    chunk_size = 32768

    with open(destination, "wb") as out_file:
        # filter(None, ...) drops empty keep-alive chunks.
        for piece in filter(None, response.iter_content(chunk_size)):
            out_file.write(piece)
if __name__ == "__main__":
    # Script entry point: download the selected dataset archive(s) from
    # Google Drive and unpack them into ./datasets.
    parser = argparse.ArgumentParser(description="data downloading")
    parser.add_argument('--dataset', choices=["yahoo", "snli", "short_yelp", "all"],
                        default="all", help='dataset to use')
    args = parser.parse_args()

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs("datasets", exist_ok=True)
    os.chdir("datasets")

    # Google Drive file ids of the dataset archives, keyed by --dataset name.
    dataset_ids = {
        "yahoo": "13azGlTuGdzWLCmgDmQPmvb_jcexVWX7i",
        "snli": "11NHEPxV7OrqmODozxQGezSU8093iUlUx",
        "short_yelp": "18h8UYr801qr-USCYRzZySi_DSbUkHW71",
    }

    if args.dataset == "all":
        # Explicit order keeps the original download sequence.
        file_id = [dataset_ids[name] for name in ("yahoo", "snli", "short_yelp")]
    else:
        file_id = [dataset_ids[args.dataset]]

    # Every archive is downloaded to the same temporary name, unpacked,
    # and removed before the next one is fetched.
    destination = "datasets.tar.gz"

    for file_id_e in file_id:
        try:
            download_file_from_google_drive(file_id_e, destination)
            # NOTE(review): extractall() on a downloaded archive trusts the
            # member paths it contains; consider an extraction filter
            # (tarfile "data" filter) where the Python version supports it.
            with tarfile.open(destination, "r:gz") as tar:
                tar.extractall()
        finally:
            # Remove the temporary archive even if download/extract failed.
            if os.path.exists(destination):
                os.remove(destination)

    os.chdir("../")