forked from sebasmos/MetaDengue
-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_dataset.py
205 lines (182 loc) · 11.6 KB
/
build_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import os
from os import walk
import json
import pandas as pd
import epiweeks
from epiweeks import Week
from datetime import date as convert_to_date
import numpy as np
import config
import shutil
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def create_dataset(root, source):
    """
    Create the images/ and annotations/ folders of the dataset.

    The whole `source` tree is copied to `<root>/images`, and an empty
    `<root>/annotations` folder is created next to it. Both steps tolerate
    folders that already exist, so the function can be called again on a
    partially built dataset (files already present in `<root>/images` are
    overwritten by the copy).

    Parameters:
        root(string)   - path to the dataset folder to build
        source(string) - path to the folder holding the images to copy
    """
    dest = os.path.join(root, "images")
    # dirs_exist_ok mirrors the exist_ok=True used for annotations/ below;
    # without it a second run fails with FileExistsError.
    shutil.copytree(source, dest, dirs_exist_ok=True)
    os.makedirs(os.path.join(root, "annotations"), exist_ok=True)
def get_epiweek(image_name):
    """
    Turn a dated name into its epiweek number.

    The name is split on '-'; the digits found in the first three pieces are
    read as year, month and day (every non-digit character is dropped).
    Parameters:
        image_name(string) - text containing a year-month-day date, such as
                             an image file name
    Returns:
        epiweek(int) - integer form of the string representation of the
                       epiweeks.Week that contains that date
    """
    pieces = image_name.split('-')
    # Evaluated lazily, in order: year, then month, then day.
    year, month, day = (
        int(''.join(ch for ch in pieces[position] if ch.isdigit()))
        for position in range(3)
    )
    week = Week.fromdate(convert_to_date(year, month, day))
    return int(str(week))
def get_label(path):
    """
    Look up the number of cases for the image stored at `path`.

    `path` is split on '/': the second component is used as the city and the
    third as the dated name passed to get_epiweek().
    Parameters:
        path(string) - '/'-separated path with at least three components
    Returns:
        cases(int) - value of the epiweek column for the matching city
    NOTE(review): this reads a module-level `df` with a 'Municipality'
    column; no `df` is defined in the visible source - confirm where it is
    expected to come from before calling this function.
    """
    components = path.split('/')
    city = components[1]
    epiweek = get_epiweek(components[2])
    city_rows = df[df['Municipality'] == city]
    return int(city_rows.loc[:, epiweek])
# Socio-economic columns exported in the "static" section of an annotation.
# Each value is stored under the name of the column it is read from.
# NOTE(review): assumes the socio-economic CSV has an "Age5-14(%)" column
# (the previous code stored the "Age15-29(%)" value under that key) - confirm.
_STATIC_SOCIOECONOMIC_COLUMNS = (
    "Age0-4(%)",
    "Age5-14(%)",
    "Age15-29(%)",
    "Age>30(%)",
    "AfrocolombianPopulation(%)",
    "IndianPopulation(%)",
    "PeoplewithDisabilities(%)",
    "Peoplewhocannotreadorwrite(%)",
    "Secondary/HigherEducation(%)",
    "Employedpopulation(%)",
    "Unemployedpopulation(%)",
    "Peopledoinghousework(%)",
    "Men(%)",
    "Women(%)",
    "Householdswithoutwateraccess(%)",
    "Householdswithoutinternetaccess(%)",
    "Buildingstratification1(%)",
    "Buildingstratification2(%)",
    "Buildingstratification3(%)",
    "Buildingstratification4(%)",
    "Buildingstratification5(%)",
    "Buildingstratification6(%)",
    "NumberofhospitalsperKm2",
    "NumberofhousesperKm2",
)


def _single_value(series):
    """Return the only element of a one-element pandas Series as a scalar.

    Series.item() raises ValueError unless the Series holds exactly one
    element, so a missing or duplicated municipality fails loudly. It replaces
    int(Series)/float(Series), which pandas has deprecated.
    """
    return series.item()


def _column_code(column):
    """Return the integer code of a '<name>_<code>' column, or None."""
    parts = column.split("_")
    if len(parts) < 2:
        return None
    try:
        return int(parts[1])
    except ValueError:
        # Column with an underscore but no numeric code: not per-municipality.
        return None


def _label_for(table, code, epiweek):
    """Return the integer label of municipality `code` for `epiweek`."""
    matches = table.loc[table["Municipality code"] == code, str(epiweek)]
    return int(_single_value(matches))


def _build_annotation(image_path, code, epiweek, row, socioeco_row,
                      cases_bin, cases_multiclass):
    """Assemble the JSON-serialisable annotation of one image."""
    # Environmental columns are looked up with the code left-padded by a
    # single "0" when it is shorter than five characters, while the cases
    # column uses the plain integer code. Kept exactly as before.
    plain_code = str(code)
    padded_code = plain_code if len(plain_code) >= 5 else "0" + plain_code

    # Population is stored per year; the year is the first four digits of
    # the epiweek. A missing year column yields 0.
    population_column = "Population" + str(int(str(epiweek)[:4]))
    if population_column in socioeco_row.columns:
        population = int(_single_value(socioeco_row[population_column]))
    else:
        population = 0

    return {
        "image_path": image_path,
        "municipality_code": code,
        "epiweeks": epiweek,
        "dynamic": {
            "cases": {
                "dengue_cases": np.array(row["cases_" + plain_code]).tolist()[0],
                "binary_classification": cases_bin,
                "multiclass": cases_multiclass,
            },
            "environmental_data": {
                "temperature": np.array(row["temperature_" + padded_code]).tolist(),
                "precipitation": np.array(row["precipitation_" + padded_code]).tolist(),
            },
            "socioeconomic_data": {
                "Population": population,
            },
        },
        "static": {
            "environmental_data": {
                "elevation": float(_single_value(socioeco_row["Elevation"])),
            },
            "socioeconomic_data": {
                column: float(_single_value(socioeco_row[column]))
                for column in _STATIC_SOCIOECONOMIC_COLUMNS
            },
        },
    }


def run():
    """
    Build the dataset folder and write one JSON annotation per image.

    Paths are taken from `config` (root, source, climatic_data_path,
    socioeco_data_path, binary_classification, multiclass_labels). The images
    are copied with create_dataset(); then, for every municipality folder in
    `source` and every image in it, an annotation is written to
    `<root>/annotations/<municipality>/<image name without extension>.json`
    when both of these hold:
      * the image's epiweek appears in the climatic CSV, and
      * the climatic CSV has at least one '<name>_<code>' column for the
        municipality.
    Each annotation is written once per image. The incidence-ratio labels
    are not exported, so that CSV is no longer read.

    Raises:
        KeyError / ValueError if a required column is missing or if a
        municipality does not match exactly one row of a lookup table.
    """
    root = config.root
    source = config.source
    create_dataset(root, source)

    # Read files
    data = pd.read_csv(config.climatic_data_path)
    socioeco_data = pd.read_csv(config.socioeco_data_path)
    binary_classification = pd.read_csv(config.binary_classification)
    multiclass_labels = pd.read_csv(config.multiclass_labels)

    # We select indexes 468-624 to filter based on satellite image
    # availability: image_2016-01-01 - 2018-12-23
    # (459-624 would include data since 2015-11-1).
    data = data[468:624].reset_index(drop=True)

    # The epiweek of every CSV row does not depend on the image being
    # processed, so it is computed once here instead of once per image.
    csv_epiweeks = [get_epiweek(date) for date in data.date]
    data["indexer"] = csv_epiweeks
    known_epiweeks = set(csv_epiweeks)
    codes_in_data = {_column_code(column) for column in data.columns}

    folders = os.listdir(source)
    print("Collected data for processing: ", folders)
    for folder_name in folders:
        code = int(folder_name)
        print(f"Processing municipality {code}")
        images = sorted(os.listdir(os.path.join(source, folder_name)))
        # Socio-economic data is per municipality, not per image.
        socioeco_row = socioeco_data[socioeco_data["Municipality code"] == code]
        anns_folder = os.path.join(root, "annotations", folder_name)
        written = 0
        for img in images:  # getting 1 image for 1 epiweek
            # Path stored inside the annotation; it uses the literal
            # "DATASET" folder name rather than `root`, as before.
            image_path = os.path.join("DATASET", "images", folder_name, img)
            epiweek = get_epiweek(img)
            print(f"{written}/{len(images)}")
            if epiweek not in known_epiweeks or code not in codes_in_data:
                continue

            # Number of cases and environmental data for this epiweek
            row = data[data["indexer"] == epiweek]
            cases_bin = _label_for(binary_classification, code, epiweek)
            cases_multiclass = _label_for(multiclass_labels, code, epiweek)

            # Build the whole annotation before touching the file system so
            # a lookup failure cannot leave an empty JSON file behind.
            annotation = _build_annotation(
                image_path, code, epiweek, row, socioeco_row,
                cases_bin, cases_multiclass,
            )

            os.makedirs(anns_folder, exist_ok=True)
            # anns_folder already starts with root; joining root again broke
            # relative roots.
            anns_path = os.path.join(anns_folder, os.path.splitext(img)[0] + ".json")
            with open(anns_path, "w") as out_file:
                json.dump(annotation, out_file, indent=6)
            written += 1
    print("Done.")
# Only build the dataset when this file is executed as a script, so that
# importing the module (e.g. to reuse get_epiweek) has no side effects.
if __name__ == "__main__":
    run()