import os
import glob
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
import mne
from scipy.io.arff import loadarff
from sklearn.model_selection import train_test_split
import torch
def prepareUCR(dataset_dir: str, fill_missing_and_variable=True, normalize=False, test_ratio=0.3, grain=10000):
"""Prepare UCR sub-datasets and merge them into one united dataset.
Download the original dataset archive from https://www.cs.ucr.edu/~eamonn/time_series_data_2018/ and
extract it in the `dataset_dir` directory.
Parameters:
dataset_dir (str): directory of UCR archive
fill_missing_and_variable (bool): whether to use repaired values provided by UCR-128 archive
normalize (bool): whether to normalize sub-datasets by its mean and std when merging
test_ratio (float): test dataset ratio
grain (int): grain size of splitting the huge merged dataset into multiple files
"""
assert os.path.exists(dataset_dir), f'UCR archive directory `{dataset_dir}` does not exist!'
# delete these sub-datasets to merge the whole UCR archive without duplication
DEL_DATASETS = [
'Chinatown', 'DistalPhalanxOutlineAgeGroup', 'DistalPhalanxOutlineCorrect',
'DodgerLoopGame', 'DodgerLoopWeekend', 'FreezerSmallTrain', 'GesturePebbleZ2',
'GunPointMaleVersusFemale', 'GunPointOldVersusYoung', 'InsectEPGSmallTrain',
'MiddlePhalanxOutlineAgeGroup', 'MiddlePhalanxOutlineCorrect', 'MixedShapesSmallTrain',
'PhalangesOutlinesCorrect', 'PickupGestureWiimoteZ', 'ProximalPhalanxOutlineAgeGroup',
'ProximalPhalanxOutlineCorrect', 'SemgHandGenderCh2', 'SemgHandSubjectCh2',
'ShakeGestureWiimoteZ', 'WormsTwoClass'
]
root = os.path.abspath(os.path.join(dataset_dir, '..'))
save_dir = os.path.join(root, 'ucr-128')
merge_dir, merge_split_dir = os.path.join(root, 'ucr-merged'), os.path.join(root, 'ucr-merged-split')
MVVL_DIR = 'Missing_value_and_variable_length_datasets_adjusted'
MVVL_DATASETS = os.listdir(os.path.join(dataset_dir, MVVL_DIR))
DATASETS = os.listdir(dataset_dir)
DATASETS.remove(MVVL_DIR)
assert len(DATASETS) == 128, f'UCR archive directory `{dataset_dir}` should contain 128 sub-datasets, but found {len(DATASETS)}!'
all_data = pd.DataFrame() # split data into train and test randomly
train_split, test_split = pd.DataFrame(), pd.DataFrame() # split data into train and test by the original dataset
for d in tqdm(sorted(DATASETS)):
if d in MVVL_DATASETS and fill_missing_and_variable:
# missing values are treated with linear interpolation
# variable length are treated with low-amplitude noise padding
df_train = pd.read_csv(os.path.join(dataset_dir, MVVL_DIR, d, f'{d}_TRAIN.tsv'), sep='\t', header=None)
df_test = pd.read_csv(os.path.join(dataset_dir, MVVL_DIR, d, f'{d}_TEST.tsv'), sep='\t', header=None)
else:
df_train = pd.read_csv(os.path.join(dataset_dir, d, f'{d}_TRAIN.tsv'), sep='\t', header=None)
df_test = pd.read_csv(os.path.join(dataset_dir, d, f'{d}_TEST.tsv'), sep='\t', header=None)
# prepare data in sub-datasets, i.e., change category names and file suffix (.tsv -> .csv)
data = pd.concat([df_train, df_test], axis=0)
if d in ['FordA', 'FordB']: # merge categories in the two datasets
new_idx = {sorted(data.iloc[:, 0].unique())[i]: f'Ford_{i:02d}' for i in range(len(data.iloc[:, 0].unique()))}
else:
new_idx = {sorted(data.iloc[:, 0].unique())[i]: f'{d}_{i:02d}' for i in range(len(data.iloc[:, 0].unique()))}
data.iloc[:, 0] = data.iloc[:, 0].map(new_idx)
df_train.iloc[:, 0] = df_train.iloc[:, 0].map(new_idx)
df_test.iloc[:, 0] = df_test.iloc[:, 0].map(new_idx)
os.makedirs(os.path.join(save_dir, d), exist_ok=True)
df_train.columns, df_test.columns = [str(i) for i in range(df_train.shape[1])], [str(i) for i in range(df_test.shape[1])]
df_train.reset_index(drop=True).to_feather(os.path.join(save_dir, d, f'train.feather'))
df_test.reset_index(drop=True).to_feather(os.path.join(save_dir, d, f'test.feather'))
# merge sub-datasets
if d not in DEL_DATASETS:
if normalize: # normalize each sub-dataset by its mean and std of the whole dataset
data_mean, data_std = data.iloc[:, 1:].values.mean(), data.iloc[:, 1:].values.std()
data.iloc[:, 1:] = (data.iloc[:, 1:] - data_mean) / (data_std + 1e-8)
all_data = pd.concat([all_data, data], axis=0, sort=True)
if normalize: # normalize each sub-dataset by its mean and std of the training dataset for the split version
train_mean, train_std = df_train.iloc[:, 1:].values.mean(), df_train.iloc[:, 1:].values.std()
df_train.iloc[:, 1:] = (df_train.iloc[:, 1:] - train_mean) / (train_std + 1e-8)
df_test.iloc[:, 1:] = (df_test.iloc[:, 1:] - train_mean) / (train_std + 1e-8)
train_split, test_split = pd.concat([train_split, df_train], axis=0, sort=True), pd.concat([test_split, df_test], axis=0, sort=True)
# randomly split data into train and test
samples_train, samples_test, targets_train, targets_test = train_test_split(
all_data.iloc[:, 1:], all_data.iloc[:, 0],
test_size=test_ratio,
random_state=42, shuffle=True,
stratify=all_data.iloc[:, 0]
)
train = pd.concat([targets_train, samples_train], axis=1)
test = pd.concat([targets_test, samples_test], axis=1)
print(f'Saving processed data: {len(train)}/{len(train_split)} samples in train/train_split set; {len(test)}/{len(test_split)} samples in test/test_split set')
os.makedirs(merge_dir, exist_ok=True)
os.makedirs(merge_split_dir, exist_ok=True)
    # feather files require string column names and a reset (dropped) index
train.columns, test.columns = [str(i) for i in range(train.shape[1])], [str(i) for i in range(test.shape[1])]
train_split.columns, test_split.columns = [str(i) for i in range(train_split.shape[1])], [str(i) for i in range(test_split.shape[1])]
    # shuffle the randomly split data before sharding
    train = train.sample(frac=1, random_state=42)
    test = test.sample(frac=1, random_state=42)
for i in tqdm(range(train.shape[0]//grain + 1)):
end = min((i+1)*grain, train.shape[0])
train.iloc[i*grain:end].reset_index(drop=True).to_feather(os.path.join(merge_dir, f'train_{i}.feather'))
for i in tqdm(range(test.shape[0]//grain + 1)):
end = min((i+1)*grain, test.shape[0])
test.iloc[i*grain:end].reset_index(drop=True).to_feather(os.path.join(merge_dir, f'test_{i}.feather'))
train.reset_index(drop=True).to_feather(os.path.join(merge_dir, 'train.feather'))
test.reset_index(drop=True).to_feather(os.path.join(merge_dir, 'test.feather'))
    # split data into train and test following the original archive
    train_split = train_split.sample(frac=1, random_state=42) # shuffle the data
    test_split = test_split.sample(frac=1, random_state=42) # shuffle the data
for i in tqdm(range(train_split.shape[0]//grain + 1)):
end = min((i+1)*grain, train_split.shape[0])
train_split.iloc[i*grain:end].reset_index(drop=True).to_feather(os.path.join(merge_split_dir, f'train_{i}.feather'))
for i in tqdm(range(test_split.shape[0]//grain + 1)):
end = min((i+1)*grain, test_split.shape[0])
test_split.iloc[i*grain:end].reset_index(drop=True).to_feather(os.path.join(merge_split_dir, f'test_{i}.feather'))
train_split.reset_index(drop=True).to_feather(os.path.join(merge_split_dir, 'train.feather'))
test_split.reset_index(drop=True).to_feather(os.path.join(merge_split_dir, 'test.feather'))
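# The helper below is not part of the original pipeline: a minimal sketch, assuming the
# `train_{i}.feather` / `test_{i}.feather` shards written by the prepare* functions, of how a
# sharded split can be read back into a single DataFrame. The function name is ours.
def load_feather_shards(shard_dir: str, split: str = 'train') -> pd.DataFrame:
    """Concatenate the `{split}_{i}.feather` shards found in `shard_dir` into one DataFrame."""
    shard_paths = sorted(
        glob.glob(os.path.join(shard_dir, f'{split}_*.feather')),
        key=lambda p: int(os.path.splitext(os.path.basename(p))[0].split('_')[-1])  # order by shard index
    )
    frames = [pd.read_feather(p) for p in shard_paths]
    return pd.concat(frames, axis=0, ignore_index=True)
# Usage sketch: load_feather_shards('./datasets/time_series/ucr-merged', 'train') should match
# the single `train.feather` file written alongside the shards.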
def prepareUEA(dataset_dir: str, normalize=False, grain=10000):
"""Prepare UEA sub-datasets and merge them into one unlabeled dataset.
Download the original dataset archive from http://www.timeseriesclassification.com/dataset.php and
extract it in the `dataset_dir` directory.
Parameters:
dataset_dir (str): directory of UEA archive
normalize (bool): whether to normalize sub-datasets by its mean and std when merging
grain (int): grain size of splitting the huge merged dataset into multiple files
"""
assert os.path.exists(dataset_dir), f'UEA archive directory `{dataset_dir}` does not exist!'
DATASETS = os.listdir(dataset_dir)
DATASETS = list(set(DATASETS) - set(['Descriptions', 'Images', 'DataDimensions.csv', 'UEAArchive2018.pdf'])) # remove unnecessary files
assert len(DATASETS) == 30, f'UEA archive directory `{dataset_dir}` should contain 30 sub-datasets, but found {len(DATASETS)}!'
root = os.path.abspath(os.path.join(dataset_dir, '..'))
save_dir, merge_dir = os.path.join(root, 'uea-30'), os.path.join(root, 'uea-merged')
all_samples = pd.DataFrame()
for d in tqdm(sorted(DATASETS)):
train = loadarff(os.path.join(dataset_dir, d, f'{d}_TRAIN.arff'))[0]
samples, labels = [], []
for sample, label in train: # instance-loop
sample = np.array([s.tolist() for s in sample]) # (d, l)
label = label.decode("utf-8") # (1,)
samples.append(sample)
labels.append(label)
samples = torch.tensor(samples, dtype=torch.float) # (n, d, l)
classes = set(labels) # unique labels
class_to_idx = {c: i for i, c in enumerate(classes)} # map label (string) to index (int)
labels = torch.tensor([class_to_idx[l] for l in labels], dtype=torch.long) # (n,)
data = {'samples': samples, 'labels': labels}
test = loadarff(os.path.join(dataset_dir, d, f'{d}_TEST.arff'))[0]
samples_test, labels_test = [], []
for sample, label in test: # instance-loop
sample = np.array([s.tolist() for s in sample]) # (d, l)
label = label.decode("utf-8") # (1,)
samples_test.append(sample)
labels_test.append(label)
samples_test = torch.tensor(samples_test, dtype=torch.float) # (n, d, l)
labels_test = torch.tensor([class_to_idx[l] for l in labels_test], dtype=torch.long) # (n,)
data_test = {'samples': samples_test, 'labels': labels_test}
# save multi-dimensional data as `.pt` file
os.makedirs(os.path.join(save_dir, d), exist_ok=True)
torch.save(data, os.path.join(save_dir, d, f'train.pt'))
torch.save(data_test, os.path.join(save_dir, d, f'test.pt'))
        if d not in ['EigenWorms', 'FaceDetection', 'InsectWingbeat', 'PenDigits']: # these sub-datasets are excluded from the merge
if normalize: # normalize data by training set mean and std per channel
train_mean, train_std = samples.mean(dim=(0, 2)), samples.std(dim=(0, 2))
if torch.isnan(train_mean).any() or torch.isnan(train_std).any(): # NaN in the dataset
train_mean, train_std = np.nanmean(samples.numpy(), axis=(0, 2)), np.nanstd(samples.numpy(), axis=(0, 2))
train_mean = torch.as_tensor(train_mean, dtype=torch.float).view(-1, 1)
train_std = torch.as_tensor(train_std, dtype=torch.float).view(-1, 1)
samples = (samples - train_mean) / (train_std + 1e-8)
samples_test = (samples_test - train_mean) / (train_std + 1e-8)
samples_flat = samples.reshape(-1, samples.shape[-1]).numpy() # (n*d, l)
samples_test_flat = samples_test.reshape(-1, samples_test.shape[-1]).numpy() # (n*d, l)
all_samples = pd.concat([all_samples, pd.DataFrame(samples_flat), pd.DataFrame(samples_test_flat)])
# save uni-dimensional data as `.feather` file
os.makedirs(os.path.join(merge_dir), exist_ok=True)
all_samples.columns = [str(i) for i in range(all_samples.shape[1])]
    all_samples = all_samples.sample(frac=1, random_state=42) # shuffle the data
for i in tqdm(range(all_samples.shape[0]//grain + 1)):
end = min((i+1)*grain, all_samples.shape[0])
all_samples.iloc[i*grain:end].reset_index(drop=True).to_feather(os.path.join(merge_dir, f'train_{i}.feather'))
all_samples.reset_index(drop=True).to_feather(os.path.join(merge_dir, 'train.feather'))
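# Example sketch (not part of the original pipeline): the `.pt` files written by prepareUEA store a
# dict with a float tensor `samples` of shape (n, d, l) and a long tensor `labels` of shape (n,).
# The helper name and its argument are ours.
def inspect_uea_subdataset(subdataset_dir: str):
    """Print the tensor shapes stored in the `train.pt`/`test.pt` files of one processed UEA sub-dataset."""
    for split in ['train', 'test']:
        data = torch.load(os.path.join(subdataset_dir, f'{split}.pt'))
        print(f'{split}: samples {tuple(data["samples"].shape)}, labels {tuple(data["labels"].shape)}')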
def prepareSleepEDF(dataset_dir: str, subdataset_name='age', channel='EEG Fpz-Cz', num_records=39, split_level='subject', test_ratio=0.2, grain=10000):
"""Prepare Sleep-EDF dataset.
Download the original dataset from https://www.physionet.org/content/sleep-edfx/1.0.0/ and
extract it in the `dataset_dir` directory.
Parameters:
dataset_dir (str): directory of Sleep-EDF dataset
subdataset_name (str): name of subdataset to be processed (`age` or `temazepam`)
channel (str): channel to be processed (`EEG Fpz-Cz` or `EEG Pz-Oz`)
num_records (int | None): number of records to be processed (default: 39, i.e., the first 20 subjects included in the older version of the dataset)
split_level (str): level of split (`subject` or `record`)
test_ratio (float): test dataset ratio
grain (int): grain size of splitting the huge merged dataset into multiple files
"""
assert os.path.exists(dataset_dir), f'Sleep-EDF directory `{dataset_dir}` does not exist!'
assert subdataset_name in ['age', 'temazepam'], f'Parameter `subdataset_name` must be one of `age` or `temazepam`, but got {subdataset_name}!'
if subdataset_name == 'age':
assert channel in ['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal'], \
'Parameter `channel` must be one of `EEG Fpz-Cz`, `EEG Pz-Oz` or `EOG horizontal`, when `subdataset_name` is `age`!'
else: # subdataset_name == 'temazepam'
assert channel in ['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', 'EMG submental'], \
'Parameter `channel` must be one of `EEG Fpz-Cz`, `EEG Pz-Oz`, `EOG horizontal`, or `EMG submental`, when `subdataset_name` is `temazepam`!'
root = os.path.abspath(os.path.join(dataset_dir, '..'))
name = 'sleep-edf'
if subdataset_name != 'age': name += f'_temazepam'
if channel != 'EEG Fpz-Cz': name += f'_{channel.lower().replace(" ", "_")}'
if num_records is None: name += '_all'
save_dir = os.path.join(root, name)
save_record_dir = os.path.join(save_dir, 'records')
os.makedirs(save_record_dir, exist_ok=True)
save_subject_dir = os.path.join(save_dir, 'subjects')
os.makedirs(save_subject_dir, exist_ok=True)
if subdataset_name == 'age':
dataset_dir = os.path.join(dataset_dir, 'sleep-cassette')
else: # if subdataset_name == 'temazepam'
dataset_dir = os.path.join(dataset_dir, 'sleep-telemetry')
psg_fnames = sorted(glob.glob(os.path.join(dataset_dir, "*PSG.edf")))
ann_fnames = sorted(glob.glob(os.path.join(dataset_dir, "*Hypnogram.edf")))
assert len(psg_fnames) == len(ann_fnames), f'Number of PSG and annotation files do not match: {len(psg_fnames)} vs. {len(ann_fnames)}!'
if num_records is None:
num_records = len(psg_fnames)
else: # if num_records is not None
assert num_records <= len(psg_fnames), f'Parameter `num_records` must be less than or equal to {len(psg_fnames)}, but got {num_records}!'
# subject-level split
subjects, records, prev_sub_id = [], [], ''
labels = { # follow AASM standard: merge S3 and S4 stages into a single N3 stage
'Sleep stage W':f'{name}_0', 'Sleep stage 1':f'{name}_1', 'Sleep stage 2':f'{name}_2',
'Sleep stage 3':f'{name}_3', 'Sleep stage 4':f'{name}_3', 'Sleep stage R':f'{name}_4',
}
for i in tqdm(range(num_records)):
# merge the records for the same subject
sub_id = psg_fnames[i].split('/')[-1][3:5] # e.g., '07' for 'SC4071E0-PSG.edf'
if split_level == 'subject' and sub_id != prev_sub_id: # a new subject
# save one subject
if len(records) > 0:
subject = pd.concat(records, axis=0) # merge all records for the same subject
subject.to_csv(os.path.join(save_subject_dir, f'subject_{prev_sub_id}.csv'), header=False, index=False)
subjects.append(subject)
records, prev_sub_id = [], sub_id
# read PSG and annotation files
raw_data = mne.io.read_raw_edf(psg_fnames[i], preload=True)
annotations = mne.read_annotations(ann_fnames[i])
raw_data.set_annotations(annotations, emit_warning=False)
        # keep only the last 30 min of wake before sleep onset and the first 30 min of wake after sleep offset, then redefine the annotations on the raw data
        # `first`/`last` index the first and last non-wake annotations; the outer record index `i` must stay intact for saving below
        for first in range(len(annotations)):
            if annotations[first]['description'][12] in ['1', '2', '3', '4', 'R']: break # first: first non-wake stage
        for last in range(len(annotations)-1, -1, -1):
            if annotations[last]['description'][12] in ['1', '2', '3', '4', 'R']: break # last: last non-wake stage
        annotations.crop(annotations[first]['onset'] - 30*60, annotations[last+1]['onset'] + 30*60)
        raw_data.set_annotations(annotations, emit_warning=False)
try:
# keep only interested (sleep) events
event_id = {'Sleep stage W':0, 'Sleep stage 1':1, 'Sleep stage 2':2, 'Sleep stage 3':3, 'Sleep stage 4':4, 'Sleep stage R':5}
events, _ = mne.events_from_annotations(raw_data, event_id=event_id, chunk_duration=30.)
# split PSG data into several 30-s epochs; sample rate is 100Hz, so 30*100 = 3000 samples per epoch
            tmax = 30. - 1./raw_data.info['sfreq'] # tmax is included
epochs = mne.Epochs(raw=raw_data, events=events, event_id=event_id, tmin=0., tmax=tmax, baseline=None, preload=True)
        except ValueError: # if sleep stage 4 is not found
try:
event_id = {'Sleep stage W':0, 'Sleep stage 1':1, 'Sleep stage 2':2, 'Sleep stage 3':3, 'Sleep stage R':5}
events, _ = mne.events_from_annotations(raw_data, event_id=event_id, chunk_duration=30.)
tmax = 30. - 1./raw_data.info['sfreq']
epochs = mne.Epochs(raw=raw_data, events=events, event_id=event_id, tmin=0., tmax=tmax, baseline=None, preload=True)
            except ValueError: # if sleep stage 3 is not found
event_id = {'Sleep stage W':0, 'Sleep stage 1':1, 'Sleep stage 2':2, 'Sleep stage R':5}
events, _ = mne.events_from_annotations(raw_data, event_id=event_id, chunk_duration=30.)
tmax = 30. - 1./raw_data.info['sfreq']
epochs = mne.Epochs(raw=raw_data, events=events, event_id=event_id, tmin=0., tmax=tmax, baseline=None, preload=True)
df = epochs.to_data_frame().loc[:, ['condition', 'epoch', channel]]
df['timestamp'] = np.tile(np.arange(3000), len(df)//3000)
# convert `channel` data in each epoch into one row and concatenate these epochs as one record
samples = pd.pivot(df, values=channel, columns='timestamp', index='epoch') # (#epochs, 3000)
targets = pd.pivot(df, values='condition', columns='timestamp', index='epoch').iloc[:, 0:1] # (#epochs, 1)
record = pd.concat([targets, samples], axis=1) # (#epochs, 3001)
# save one record
record_id = os.path.basename(psg_fnames[i]).split('-')[0]
record.replace(labels, inplace=True)
record.to_csv(os.path.join(save_record_dir, f'{i:03d}_{record_id}.csv'), header=False, index=False)
# add this record to the list
records.append(record)
# save the last subject
if len(records) > 0:
subject = pd.concat(records, axis=0) # merge all records for the same subject
subject.to_csv(os.path.join(save_subject_dir, f'subject_{prev_sub_id}.csv'), header=False, index=False)
subjects.append(subject)
if num_records is None:
if subdataset_name == 'age':
assert len(records) == 153, f'Number of records in Sleep-EDF_sleep-cassette is not 153, but {len(records)}!'
else: # if subdataset_name == 'temazepam'
assert len(records) == 44, f'Number of records in Sleep-EDF_sleep-telemetry is not 44, but {len(records)}!'
random.seed(42)
if split_level == 'subject':
random.shuffle(subjects)
train_subjects, test_subjects = subjects[:int(len(subjects)*(1-test_ratio) + 0.5)], subjects[int(len(subjects)*(1-test_ratio) + 0.5):]
train_df, test_df = pd.concat(train_subjects, axis=0), pd.concat(test_subjects, axis=0)
else: # if split_level == 'record'
random.shuffle(records)
train_records, test_records = records[:int(len(records)*(1-test_ratio) + 0.5)], records[int(len(records)*(1-test_ratio) + 0.5):]
train_df, test_df = pd.concat(train_records, axis=0), pd.concat(test_records, axis=0)
    # feather files require string column names and a reset (dropped) index
train_df.columns, test_df.columns = [str(i) for i in range(train_df.shape[1])], [str(i) for i in range(test_df.shape[1])]
    train_df = train_df.sample(frac=1, random_state=42) # shuffle the data
    test_df = test_df.sample(frac=1, random_state=42) # shuffle the data
for i in tqdm(range(train_df.shape[0]//grain + 1)):
end = min((i+1)*grain, train_df.shape[0])
train_df.iloc[i*grain:end].reset_index(drop=True).to_feather(os.path.join(save_dir, f'train_{i}.feather'))
for i in tqdm(range(test_df.shape[0]//grain + 1)):
end = min((i+1)*grain, test_df.shape[0])
test_df.iloc[i*grain:end].reset_index(drop=True).to_feather(os.path.join(save_dir, f'test_{i}.feather'))
train_df.reset_index(drop=True).to_feather(os.path.join(save_dir, 'train.feather'))
test_df.reset_index(drop=True).to_feather(os.path.join(save_dir, 'test.feather'))
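# Example sketch (hypothetical helper, not part of the original pipeline): the feather files written by
# prepareSleepEDF keep the sleep-stage label (e.g., `sleep-edf_0` ... `sleep-edf_4`) in the first column,
# so the class balance of a split can be checked directly from disk.
def sleep_stage_distribution(save_dir: str, split: str = 'train') -> pd.Series:
    """Count samples per sleep-stage label in a feather split written by prepareSleepEDF."""
    df = pd.read_feather(os.path.join(save_dir, f'{split}.feather'))
    return df['0'].value_counts().sort_index()  # column '0' holds the string labels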
def prepareEpilepsy(dataset_dir: str, val_ratio=0.1, test_ratio=0.1, binary_class=False):
"""Prepare Epilepsy Serizure dataset.
Download the original dataset from https://raw.githubusercontent.com/emadeldeen24/TS-TCC/main/data_preprocessing/epilepsy/data_files/data.csv and
move it to the `dataset_dir` directory.
The original dataset is at https://archive.ics.uci.edu/ml/datasets/Epileptic+Seizure+Recognition, which is recently removed for some reason.
Parameters:
dataset_dir (str): directory of Epilepsy Serizure dataset
test_ratio (float): test dataset ratio
"""
assert os.path.exists(dataset_dir), f'Epilepsy Serizure directory `{dataset_dir}` does not exist!'
root = os.path.abspath(os.path.join(dataset_dir, '..'))
save_dir = os.path.join(root, 'epilepsy') if not binary_class else os.path.join(root, 'epilepsy_binary')
# load data
data = pd.read_csv(os.path.join(dataset_dir, 'data.csv'), index_col=0)
# split data into train, val and test
samples_train, samples_test, targets_train, targets_test = train_test_split(
data.iloc[:, :-1], data.iloc[:, -1],
test_size=val_ratio+test_ratio,
random_state=42, shuffle=True,
stratify=data.iloc[:, -1]
)
samples_val, samples_test, targets_val, targets_test = train_test_split(
samples_test, targets_test,
test_size=test_ratio/(val_ratio + test_ratio),
random_state=42, shuffle=True
)
if binary_class:
labels = {1:'epilepsy_binary_0', 2:'epilepsy_binary_1', 3:'epilepsy_binary_1', 4:'epilepsy_binary_1', 5:'epilepsy_binary_1'}
else: # keep the original labels
labels = {1:'epilepsy_0', 2:'epilepsy_1', 3:'epilepsy_2', 4:'epilepsy_3', 5:'epilepsy_4'}
targets_train, targets_val, targets_test = targets_train.replace(labels), targets_val.replace(labels), targets_test.replace(labels)
train = pd.concat([targets_train, samples_train], axis=1)
val = pd.concat([targets_val, samples_val], axis=1)
test = pd.concat([targets_test, samples_test], axis=1)
# reset feather columns and drop index
train.columns, val.columns, test.columns = [str(i) for i in range(train.shape[1])], [str(i) for i in range(val.shape[1])], [str(i) for i in range(test.shape[1])]
os.makedirs(save_dir, exist_ok=True)
train.reset_index(drop=True).to_feather(os.path.join(save_dir, 'train.feather'))
val.reset_index(drop=True).to_feather(os.path.join(save_dir, 'val.feather'))
test.reset_index(drop=True).to_feather(os.path.join(save_dir, 'test.feather'))
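# Example sketch (hypothetical helper): a quick sanity check of the Epilepsy splits written above,
# using load_feather_dataset defined below. The function name and print format are ours.
def check_epilepsy_splits(save_dir: str):
    """Report the tensor shapes of the train/val/test feather files written by prepareEpilepsy."""
    for split in ['train', 'val', 'test']:
        samples, targets = load_feather_dataset(os.path.join(save_dir, f'{split}.feather'))
        print(f'{split}: samples {tuple(samples.shape)}, targets {tuple(targets.shape)}')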
def load_pt_dataset(data_path):
data = torch.load(data_path)
try:
samples = data['samples'].float() # torch tensor: (n_samples, n_features, length)
except:
samples = torch.Tensor(data['samples']).float()
try:
targets = data['labels'].long() # torch tensor: (n_samples,)
except:
targets = torch.Tensor(data['labels']).long() # torch tensor: (n_samples,)
return samples, targets
def load_feather_dataset(data_path):
data = pd.read_feather(data_path)
classes = sorted(data.iloc[:, 0].unique())
class_to_idx = {c: i for i, c in enumerate(classes)}
data[data.columns[0]] = data[data.columns[0]].map(class_to_idx)
samples = torch.Tensor(data.iloc[:, 1:].values).float()
targets = torch.Tensor(data.iloc[:, 0].values).long()
if samples.ndim == 2:
samples.unsqueeze_(1)
return samples, targets
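# Example sketch (the helper name is ours, assuming standard torch utilities): the two loaders above
# return `(samples, targets)` tensors, which can be wrapped into a DataLoader for training.
def make_dataloader(data_path: str, batch_size: int = 128, shuffle: bool = True):
    """Wrap a processed `.pt` or `.feather` split into a torch DataLoader."""
    from torch.utils.data import TensorDataset, DataLoader
    if data_path.endswith('.pt'):
        samples, targets = load_pt_dataset(data_path)
    else:
        samples, targets = load_feather_dataset(data_path)
    return DataLoader(TensorDataset(samples, targets), batch_size=batch_size, shuffle=shuffle)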
def mergeDatasets(dataset_dir_dict: dict, save_dir: str, grain=10000):
"""merge provided datasets with original split of train test set.
Parameters:
dataset_dir_dict (dict): dictionery of the directory of source sub-datasets
save_dir (str): save directory of the merged dataset
grain (int): grain size of splitting the huge merged dataset into multiple files
"""
# get sub-dataset dir list
UEA_DIR_NAME = 'uea-30'
UEA_DEL_DATASETS = ['InsectWingbeat', 'EigenWorms']
# UEA_DEL_DATASETS = []
MAX_LENGTH_LIMIT = 2048
DATASETS = []
merge_dir, merge_shuffle_dir = os.path.join(save_dir, 'all-merged-ori'), os.path.join(save_dir, 'all-merged')
os.makedirs(merge_dir, exist_ok=True)
os.makedirs(merge_shuffle_dir, exist_ok=True)
for directory, dataset_type in dataset_dir_dict.items():
if dataset_type == 'single':
DATASETS.append(directory)
else:
for d in os.listdir(directory):
if directory.split('/')[-1] == UEA_DIR_NAME and d in UEA_DEL_DATASETS:
continue
DATASETS.append(os.path.join(directory, d))
# load sub-datasets
merged_df = {}
df_data = {}
df_no_duplicate_data = []
for t in ['train', 'val', 'test']:
df_data[t] = []
for d in tqdm(sorted(DATASETS)):
        archive_name = d.split("/")[-2].split("-")[0]
        dataset_name = d.split("/")[-1]
        uniq_name = archive_name + "-" + dataset_name
data_dict = {}
df_data_dict = {}
val_set_exist = True
for t in ['train', 'val', 'test']:
# load data
try:
try:
samples, targets = load_pt_dataset(os.path.join(d, f'{t}.pt'))
except:
samples, targets = load_feather_dataset(os.path.join(d, f'{t}.feather'))
except:
# load test set for dataset without val set
val_set_exist = False
try:
samples, targets = load_pt_dataset(os.path.join(d, f'test.pt'))
except:
samples, targets = load_feather_dataset(os.path.join(d, f'test.feather'))
# split multi-variate sample to uni-variate samples
N, C, L = samples.shape
samples = samples.reshape(N*C, L)
            if L > MAX_LENGTH_LIMIT: # truncate overlong sequences to the length limit
                samples = samples[:, :MAX_LENGTH_LIMIT]
targets = np.repeat(targets, C, axis=0)
data_dict[t] = {'samples': pd.DataFrame(samples.numpy()), 'targets': pd.DataFrame(targets.numpy())}
# # remove large single datasets and long sequence
# if L > MAX_LENGTH_LIMIT:
# continue
# reindex classes
ori_idx = data_dict['train']['targets'].iloc[:, 0].unique()
new_idx = {sorted(ori_idx)[i]: f'{uniq_name}_{i:02d}' for i in range(len(ori_idx))}
# concat target and feature
for t in ['train', 'val', 'test']:
data_dict[t]['targets'].iloc[:, 0] = data_dict[t]['targets'].iloc[:, 0].map(new_idx)
df_data_dict[t] = pd.concat([data_dict[t]['targets'], data_dict[t]['samples']], axis=1)
df_data_dict[t].columns = [str(i) for i in range(df_data_dict[t].shape[1])]
df_data[t].append(df_data_dict[t])
if t != 'val' or val_set_exist:
df_no_duplicate_data.append(df_data_dict[t])
        print(f'Finished processing dataset {uniq_name} '
f'with train/val/test size: {df_data_dict["train"].shape[0]}/{df_data_dict["val"].shape[0]}/{df_data_dict["test"].shape[0]}, '
f'features num: {C}, '
f'class num: {len(new_idx)}, '
f'length: {df_data_dict["train"].shape[1]}')
# merge all datasets
# for t in ['train', 'val', 'test']:
for t in ['train']:
merged_df[t] = pd.concat(df_data[t], axis=0) # sort=True will reorder the sequence
merged_df[t].columns = [str(i) for i in range(merged_df[t].shape[1])]
print(f'Saving processed data: '
f'{len(merged_df["train"])} samples in train set, '
# f'{len(merged_df["val"])} samples in val set, '
# f'{len(merged_df["test"])} samples in test set, '
f'max_length: {merged_df["train"].shape[1]}, '
f'total_classes: {len(merged_df["train"].iloc[:, 0].unique())}')
# save original split data
# for t in ['train', 'val', 'test']:
for t in ['train']:
for i in tqdm(range(merged_df[t].shape[0]//grain + 1)):
end = min((i+1)*grain, merged_df[t].shape[0])
merged_df[t].iloc[i*grain:end].reset_index(drop=True).to_feather(os.path.join(merge_dir, f'{t}_{i}.feather'))
print(f'Start save whole {t} merged data')
merged_df[t].reset_index(drop=True).to_feather(os.path.join(merge_dir, f'{t}.feather'))
# save shuffled data
# shuffled_df = {}
# test_ratio = 0.3
# print("Start merge data")
# no_duplicate_merged_data = pd.concat(df_no_duplicate_data, axis=0)
# no_duplicate_merged_data.columns = [str(i) for i in range(no_duplicate_merged_data.shape[1])]
# print(f"Start shuffle merged data, "
# f"total_num: {no_duplicate_merged_data.shape[0]}, "
# f"classes: {len(no_duplicate_merged_data.iloc[:, 0].unique())}")
# # shuffled_df['train'] = no_duplicate_merged_data.sample(frac=1.0, random_state=42)
# shuffled_df['train'] = no_duplicate_merged_data
# shuffled_df['test'] = no_duplicate_merged_data.sample(frac=test_ratio, random_state=42)
# print(f'Saving shuffled data: '
# f'{len(shuffled_df["train"])} samples in train set, '
# f'{len(shuffled_df["test"])} samples in test set, '
# f'train total classes: {len(shuffled_df["train"].iloc[:, 0].unique())}, '
# f'test total classes: {len(shuffled_df["test"].iloc[:, 0].unique())}')
# for t in ['train', 'test']:
# for i in tqdm(range(shuffled_df[t].shape[0]//grain + 1)):
# end = min((i+1)*grain, shuffled_df[t].shape[0])
# shuffled_df[t].iloc[i*grain:end].reset_index(drop=True).to_feather(os.path.join(merge_shuffle_dir, f'{t}_{i}.feather'))
# print(f'Start save whole {t} shuffle merged data')
# shuffled_df[t].reset_index(drop=True).to_feather(os.path.join(merge_shuffle_dir, f'{t}.feather'))
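# Example sketch (hypothetical helper): classes in the merged dataset are renamed to
# `{archive}-{dataset}_{index}` by mergeDatasets, so the contribution of each source dataset
# can be recovered from the label column of the merged feather file.
def classes_per_source(merged_train_path: str) -> pd.Series:
    """Count how many samples each source dataset contributes to a merged feather split."""
    df = pd.read_feather(merged_train_path)
    source = df['0'].str.rsplit('_', n=1).str[0]  # strip the trailing per-class index
    return source.value_counts()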
def prepareAnomalyDataset(data_dir, save_dir, seed=42, normal_num=900, outliers_num=100, test_ratio=0.3):
    """Build a binary anomaly-detection dataset from a labeled `test.pt` file:
    class-0 samples are treated as normal and all other classes as outliers."""
    torch.manual_seed(seed) # seed the random sampling below for reproducibility
    samples, targets = load_pt_dataset(os.path.join(data_dir, 'test.pt'))
undamaged_samples = samples[targets == 0]
damaged_samples = samples[targets != 0]
# sample normal
perm1 = torch.randperm(undamaged_samples.shape[0])
normal = undamaged_samples[perm1[:normal_num]]
# sample outliers
perm2 = torch.randperm(damaged_samples.shape[0])
outliers = damaged_samples[perm2[:outliers_num]]
# print(undamaged_samples.shape, damaged_samples.shape, normal.shape, outliers.shape)
# shuffle
new_samples = torch.cat((normal, outliers), 0)
new_targets = torch.cat((torch.zeros(normal_num), torch.ones(outliers_num)), 0)
perm = torch.randperm(new_samples.shape[0])
new_samples = new_samples[perm]
new_targets = new_targets[perm]
# split
train_num = int((normal_num + outliers_num) * (1 - test_ratio))
train_samples = new_samples[:train_num]
train_targets = new_targets[:train_num]
test_samples = new_samples[train_num:]
test_targets = new_targets[train_num:]
if not os.path.exists(save_dir):
os.makedirs(save_dir)
obj = {'samples': train_samples, 'labels': train_targets}
torch.save(obj, f'{save_dir}/train.pt')
obj = {'samples': test_samples, 'labels': test_targets}
torch.save(obj, f'{save_dir}/test.pt')
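# Example sketch (hypothetical helper): verify the anomaly splits written above, where label 0 marks
# normal samples and label 1 marks outliers.
def summarize_anomaly_split(save_dir: str):
    """Report the size and outlier ratio of the train/test splits written by prepareAnomalyDataset."""
    for split in ['train', 'test']:
        data = torch.load(os.path.join(save_dir, f'{split}.pt'))
        labels = data['labels']
        print(f'{split}: {len(labels)} samples, outlier ratio {labels.float().mean().item():.3f}')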
if __name__ == "__main__":
# prepareEpilepsy("./datasets/time_series/epilepsy_origin")
# prepareUCR(dataset_dir="./datasets/time_series/UCRArchive_2018",
# fill_missing_and_variable=True, normalize=False, test_ratio=0.3, grain=10000)
# prepareUEA(dataset_dir="./datasets/time_series/Multivariate_arff")
# prepareSleepEDF(dataset_dir="./datasets/time_series/physionet.org/files/sleep-edfx/1.0.0")
# merge all datasets
# dataset_dir_dict = {
# './datasets/time_series/uea-30': 'multi',
# './datasets/time_series/ucr-128': 'multi',
# './datasets/time_series/TFC-datasets': 'multi',
# }
# save_dir = './datasets/time_series/'
# mergeDatasets(dataset_dir_dict, save_dir, grain=100000)
# construct anomaly detection for FD-B
for s in range(1, 6):
prepareAnomalyDataset(
data_dir="./datasets/time_series/TFC-datasets/FD-B",
save_dir=f"./datasets/time_series/AnomalyDetection/FD-B/s{s}",
seed=s)