-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_sample_data_folder.py
44 lines (34 loc) · 1.31 KB
/
generate_sample_data_folder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""
Generates a sample data folder to upload to the repo by copying the file structure of the actual data folder.
Every folder keeps a random sample of at most 20 of its entries (NUM_SAMPLES). Sampled csv/tsv files of 100 MB
or more are cut down to their first 10000 rows (MAX_ROWS); smaller csv/tsv files and all other sampled files
(e.g. images) are kept in full.
"""
import os
import shutil
import random
import pandas as pd
# Seed the global RNG so that repeated runs draw the same random sequence.
random.seed(0)

# Root directory of the full dataset that the sample is drawn from.
SOURCE_FOLDER = "C:/Users/User/Downloads/hanzi_data_final"
# Upper bound on the number of entries sampled from each folder.
NUM_SAMPLES = 20
# Number of leading rows kept from a csv/tsv file of 100 MB or more.
MAX_ROWS = 10_000
def copy_file_samples(folder, *, source_root=None, num_samples=None, max_rows=None):
    """Recursively copy a random sample of ``folder`` into the working directory.

    The source directory is ``<source_root>/<folder>``; the destination is
    ``folder`` relative to the current working directory. At most
    ``num_samples`` entries (files and sub-folders alike) are picked at random
    from the source directory. Sub-folders are processed recursively, ``.csv``
    and ``.tsv`` files are re-written through pandas (only the first
    ``max_rows`` rows when the file is 100 MB or larger), and every other file
    is copied unchanged.

    Args:
        folder: Path of the folder, relative to both the source root and the
            current working directory.
        source_root: Root of the full dataset. Defaults to ``SOURCE_FOLDER``.
        num_samples: Maximum number of entries kept per folder. Defaults to
            ``NUM_SAMPLES``.
        max_rows: Row cap applied to large csv/tsv files. Defaults to
            ``MAX_ROWS``.
    """
    if source_root is None:
        source_root = SOURCE_FOLDER
    if num_samples is None:
        num_samples = NUM_SAMPLES
    if max_rows is None:
        max_rows = MAX_ROWS

    source_dir = f"{source_root}/{folder}"
    # os.listdir returns entries in arbitrary order; sorting first makes the
    # seeded random sample reproducible across runs and platforms.
    entries = sorted(os.listdir(source_dir))
    entries = random.sample(entries, k=min(len(entries), num_samples))

    # exist_ok lets the script be re-run without deleting earlier output.
    os.makedirs(folder, exist_ok=True)

    for name in entries:
        source_path = f"{source_dir}/{name}"
        dest_path = f"{folder}/{name}"

        # Ask the filesystem instead of guessing from a "." in the name:
        # directories may contain dots and files may lack an extension.
        if os.path.isdir(source_path):
            copy_file_samples(
                dest_path,
                source_root=source_root,
                num_samples=num_samples,
                max_rows=max_rows,
            )
            continue

        suffix = os.path.splitext(name)[1].lower()
        if suffix in (".csv", ".tsv"):
            # Tab-separated files must be parsed and written with a tab
            # delimiter, otherwise each row collapses into a single column.
            sep = "\t" if suffix == ".tsv" else ","
            size_mb = os.path.getsize(source_path) / 1048576
            # nrows stops parsing early, so a large file is never fully
            # loaded into memory just to be truncated afterwards.
            nrows = None if size_mb < 100 else max_rows
            df = pd.read_csv(source_path, index_col=0, sep=sep, nrows=nrows)
            df.to_csv(dest_path, sep=sep)
        else:
            shutil.copyfile(source_path, dest_path)
def main():
    """Build the sample copy of the top-level ``data`` folder."""
    copy_file_samples("data")


if __name__ == "__main__":
    main()