from erddapy import ERDDAP
import pprint
from ast import literal_eval
import pandas as pd
import numpy as np
# Default selection settings (only 'project' is currently applied in filter_metadata)
project = 'SAMBA'
basin = 'Bornholm Basin'
year = 2024
month = 4

def load_metadata():
    """Load the dataset metadata and the allDatasets table from the VOTO ERDDAP server."""
    server = "https://erddap.observations.voiceoftheocean.org/erddap"
    e = ERDDAP(
        server=server,
        protocol="tabledap",
        response="csv",
    )
    e.dataset_id = "meta_metadata_table"
    metadata = e.to_pandas(
        index_col="datasetID",
        date_format="%f"
    )
    e.dataset_id = "allDatasets"
    all_datasets = e.to_pandas(
        index_col="datasetID",
        date_format="%f"
    )

    def obj_to_string(x):
        return pprint.pformat(x)

    def variable_exists(x, variable):
        return variable in x

    def basin_simplify(basin):
        # map the detailed basin names from ERDDAP onto a small set of regions
        if basin == 'Bornholm Basin, Arkona Basin':
            return 'Bornholm Basin'
        if basin == 'Eastern Gotland Basin, Northern Baltic Proper':
            return 'Eastern Gotland'
        if basin == 'Northern Baltic Proper, Eastern Gotland Basin':
            return 'Eastern Gotland'
        elif basin == 'Western Gotland Basin':
            return 'Western Gotland'
        elif basin == 'Eastern Gotland Basin':
            return 'Eastern Gotland'
        elif basin == 'Western Gotland Basin, Eastern Gotland Basin':
            return 'Western Gotland'
        elif basin == 'Kattegat':
            return 'Skagerrak, Kattegat'
        elif basin == 'Kattegat, Skagerrak':
            return 'Skagerrak, Kattegat'
        elif basin == 'Skagerrak':
            return 'Skagerrak, Kattegat'
        elif basin == 'Northern Baltic Proper':
            return 'Eastern Gotland'
        # handle escaped / mis-encoded variants of 'Åland Sea'
        elif basin == '\\u00c3\\u0085land Sea':
            return 'Åland Sea'
        elif basin == '\\u00c5land Sea':
            return 'Åland Sea'
        else:
            return basin

    metadata['optics_serial'] = metadata.optics_serial.apply(obj_to_string)
    metadata['irradiance_serial'] = metadata.irradiance_serial.apply(obj_to_string)
    metadata['altimeter_serial'] = metadata.altimeter_serial.apply(obj_to_string)
    metadata['glider_serial'] = metadata.glider_serial.apply(obj_to_string)
    metadata['basin'] = metadata.basin.apply(basin_simplify)
    metadata['time_coverage_end (UTC)'] = pd.to_datetime(metadata['time_coverage_end (UTC)'])
    metadata['time_coverage_start (UTC)'] = pd.to_datetime(metadata['time_coverage_start (UTC)'])
    return metadata, all_datasets
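
# Example usage (sketch; requires network access to the VOTO ERDDAP server):
# >>> metadata, all_datasets = load_metadata()
# >>> metadata[['basin', 'glider_serial', 'time_coverage_start (UTC)']].head()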

def variable_exists(x, variable):
    """Return True if `variable` occurs in the dataset's variable listing `x`."""
    return variable in x

def create_available_variables_columns(metadata):
    """Add a boolean '<variable>_available' column for every variable found in any dataset."""
    # create the set of all variables across all datasets
    all_variables_set = set()
    menuentries = []
    newmetadatacolumns = {}
    for index in range(0, len(metadata.index)):
        all_variables_set.update(literal_eval(metadata.iloc[index].variables))
    for variable in list(all_variables_set):
        newmetadatacolumns[variable + '_available'] = metadata.variables.apply(
            variable_exists, args=(variable,))
        menuentries.append({'label': variable + '_available', 'value': variable + '_available'})
    metadata = metadata.join(pd.DataFrame.from_dict(newmetadatacolumns))
    return metadata
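
# Example usage (sketch): flag which datasets carry a given sensor variable;
# the variable name 'oxygen_concentration' here is only an assumed example.
# >>> metadata, _ = load_metadata()
# >>> metadata = create_available_variables_columns(metadata)
# >>> metadata[metadata['oxygen_concentration_available']].index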

def filter_metadata():
    """Load the metadata and keep only the datasets of interest (currently project SAMBA)."""
    # Better to return a filtered DataFrame instead of IDs?
    mode = 'all'  # alternatives: 'nrt', 'delayed'
    metadata, all_datasets = load_metadata()
    metadata = metadata[
        metadata['project'].isin(['SAMBA'])  # &
        # (metadata['project'] == project) &
        # (metadata['basin'] == basin) &
        # (metadata['basin'] == 'Åland Sea') &
        # (metadata['time_coverage_start (UTC)'].dt.year > 2023) &
        # (metadata['time_coverage_start (UTC)'].dt.year == year) &
        # (metadata['time_coverage_start (UTC)'].dt.month == month) &
        # (metadata['time_coverage_start (UTC)'].dt.day < 15)
    ]
    # for basins
    # metadata = drop_overlaps(metadata)
    return metadata, all_datasets

def add_delayed_dataset_ids(metadata, all_datasets):
    """Return the nrt dataset IDs plus their delayed-mode counterparts where these exist."""
    nrt_dataset_ids = list(metadata.index)
    delayed_dataset_ids = [
        datasetid.replace('nrt', 'delayed')
        if datasetid.replace('nrt', 'delayed') in all_datasets.index
        else datasetid
        for datasetid in metadata.index]
    all_dataset_ids = nrt_dataset_ids + delayed_dataset_ids
    return all_dataset_ids  # alternatively: metadata.loc[all_dataset_ids]
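
# Example usage (sketch): expand the filtered near-real-time dataset IDs with
# their delayed-mode counterparts where those exist on the server.
# >>> metadata, all_datasets = filter_metadata()
# >>> dataset_ids = add_delayed_dataset_ids(metadata, all_datasets)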

def drop_overlaps(metadata):
    """Within each basin, drop datasets that overlap an earlier dataset by more than a day."""
    drop_overlap = True
    dropped_datasets = []
    for basin in ['Bornholm Basin', 'Skagerrak, Kattegat',
                  'Western Gotland', 'Eastern Gotland', 'Åland Sea']:
        meta = metadata[metadata['basin'] == basin]
        for index in range(0, len(meta)):
            glidercounter = 1
            for index2 in range(0, index):
                r1 = dict(start=meta.iloc[index]['time_coverage_start (UTC)'],
                          end=meta.iloc[index]['time_coverage_end (UTC)'])
                r2 = dict(start=meta.iloc[index2]['time_coverage_start (UTC)'],
                          end=meta.iloc[index2]['time_coverage_end (UTC)'])
                latest_start = max(r1['start'], r2['start'])
                earliest_end = min(r1['end'], r2['end'])
                delta = (earliest_end - latest_start).days + 1
                overlap = max(0, delta)
                if overlap > 1:
                    glidercounter += 1
                    # if two glider datasets overlap by more than a
                    # day, they would be plotted in multiple rows...
                    if drop_overlap:
                        # ...and are optionally dropped here
                        dropped_datasets.append(meta.index[index])
    # print('dropping datasets {}'.format(dropped_datasets))
    metadata = metadata.drop(dropped_datasets)
    return metadata

def drop_overlaps_fast(metadata):
    """For each start date keep only the longest dataset, dropping shorter overlapping ones."""
    with pd.option_context('mode.chained_assignment', None):
        metadata['duration'] = metadata['time_coverage_end (UTC)'] - metadata['time_coverage_start (UTC)']
        metadata['startdate'] = metadata['time_coverage_start (UTC)'].dt.date
        remaining = metadata.sort_values(
            ['startdate', 'duration'], ascending=[True, False])[['startdate']].drop_duplicates().index
    return metadata.loc[remaining]
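
# Example usage (sketch): keep only the longest dataset per start date,
# e.g. before plotting one row per basin.
# >>> metadata, _ = filter_metadata()
# >>> metadata = drop_overlaps_fast(metadata)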

def voto_seaexplorer_dataset(ds):
    """
    Adapt a VOTO xarray dataset, for example downloaded from the VOTO ERDDAP
    server (https://erddap.observations.voiceoftheocean.org/erddap/index.html),
    for use in GliderTools.

    Parameters
    ----------
    ds : xarray.Dataset

    Returns
    -------
    xarray.Dataset
        Dataset containing all columns of the source file plus a dives column
    """
    ds = add_dive_column(ds)
    return ds
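
# Example usage (sketch, assuming `ds` is a VOTO glider dataset opened with
# xarray, e.g. via erddapy's to_xarray()):
# >>> ds = voto_seaexplorer_dataset(ds)
# >>> ds["dives"]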

# this is a version where only the profile_nums are changed, to test whether
# skipping the concatenation helps with datashader performance
def voto_concat_datasets(datasets):
    """
    Renumber profile_num across multiple datasets so that it starts counting
    from one for the first dataset and increases monotonically over the whole
    list, leaving the datasets ready to be concatenated along the time dimension.

    Parameters
    ----------
    datasets : list of xarray.Dataset

    Returns
    -------
    list of xarray.Dataset
        the input datasets with adapted profile_num values
    """
    # in case the datasets have a different set of variables, empty variables
    # could be created to allow for concatenation (concat with a different set
    # of variables leads to an error):
    # mlist = [set(dataset.variables.keys()) for dataset in datasets]
    # allvariables = set.union(*mlist)
    # for dataset in datasets:
    #     missing_vars = allvariables - set(dataset.variables.keys())
    #     for missing_var in missing_vars:
    #         dataset[missing_var] = np.nan
    # renumber profiles, so that profile_num stays unique in the concatenated dataset
    for index in range(1, len(datasets)):
        datasets[index]["profile_num"] += (
            datasets[index - 1].copy()["profile_num"].max()
        )
    return datasets
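
# Example usage (sketch, assuming `ds_list` holds VOTO xarray datasets loaded
# elsewhere): renumber the profiles, then concatenate along time.
# >>> import xarray as xr
# >>> ds_list = voto_concat_datasets(ds_list)
# >>> ds = add_dive_column(xr.concat(ds_list, dim="time"))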

def voto_concat_datasets2(datasets):
    """
    Concatenate multiple datasets along the time dimension; profile_num
    and dives variable(s) are meant to be adapted so that they start counting
    from one for the first dataset and increase monotonically.

    Parameters
    ----------
    datasets : list of xarray.Dataset

    Returns
    -------
    xarray.Dataset
        concatenated Dataset containing the data from the list of datasets
    """
    import xarray as xr
    # in case the datasets have a different set of variables, empty variables
    # could be created here to allow for concatenation (concat with a different
    # set of variables leads to an error)
    mlist = [set(dataset.variables.keys()) for dataset in datasets]
    allvariables = set.union(*mlist)
    # for dataset in datasets:
    #     missing_vars = allvariables - set(dataset.variables.keys())
    #     for missing_var in missing_vars:
    #         dataset[missing_var] = np.nan
    # renumbering of profiles is handled in voto_concat_datasets
    # for index in range(1, len(datasets)):
    #     datasets[index]["profile_num"] += (
    #         datasets[index - 1].copy()["profile_num"].max()
    #     )
    # concatenate only the listed data variables along time
    ds = xr.concat(datasets, dim="time", data_vars=["temperature", "salinity"])
    # ds = add_dive_column(ds)
    return ds
#def dask_add_dives(profile_nu):

def add_dive_column(ds):
    """Add a dives column to the dataset.

    Profiles with profile_direction == 1 keep their integer profile_num;
    profiles in the other direction get profile_num + 0.5.

    Parameters
    ----------
    ds : xarray.Dataset

    Returns
    -------
    xarray.Dataset
        Dataset containing a dives column
    """
    # ds["dives"] = np.where(ds.profile_direction == 1, ds.profile_num, ds.profile_num + 0.5)
    ds["dives"] = ds.profile_num.where(ds.profile_direction == 1, ds.profile_num + 0.5)
    return ds
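
if __name__ == "__main__":
    # Minimal demonstration of the metadata pipeline (a sketch; requires
    # network access to the VOTO ERDDAP server).
    metadata, all_datasets = filter_metadata()
    metadata = drop_overlaps_fast(metadata)
    metadata = create_available_variables_columns(metadata)
    dataset_ids = add_delayed_dataset_ids(metadata, all_datasets)
    print(f"{len(dataset_ids)} dataset IDs selected:")
    pprint.pprint(dataset_ids)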