-
Notifications
You must be signed in to change notification settings - Fork 0
/
GeneralDataProcessing.py
96 lines (73 loc) · 3.24 KB
/
GeneralDataProcessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import logging
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from utils import normalize_MPU9250_data
# Script-wide configuration: plotting/display options, input location and the
# list of sensor sources that the loading loop below is allowed to read.
plt.interactive(True)  # switch matplotlib to interactive mode
pd.options.display.max_columns = 15

pic_prefix = 'pic/'
# data_path = 'data/CSV'
data_path = 'Anonimised Data/Data'

# One entry per player: "<data_path>/<folder name>".
# Hidden entries (leading '.') are skipped, and so is anything that is not a
# directory -- a stray regular file here would otherwise make the later
# os.listdir(player_folder) call raise NotADirectoryError.
# The listing is sorted because os.listdir returns entries in arbitrary order;
# sorting keeps the processing order reproducible between runs.
player_folders = [
    f"{data_path}/{folder}"
    for folder in sorted(os.listdir(data_path))
    if not folder.startswith('.') and os.path.isdir(f"{data_path}/{folder}")
]

# Filled by the per-player loop: player id -> {data source name -> DataFrame}.
data_dict = {}

# File-name prefixes (the part before the first '_') that will be loaded.
data_sources_list = [
    'schairlog',
    'gamelog',
    'hrm',
    'envibox',
    'datalog',
    'eyetracker',
    # 'key',
    # 'mkey',
    'mxy',
    # 'gyro',  # Bad quality data
]  # List sources for analysis here
# chair_data_columns = ['time', 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'mag_x', 'mag_y', 'mag_z']
logger = logging.getLogger(__name__)


def _load_player_sources(player_folder, wanted_sources):
    """Read the wanted CSV files of one player into one DataFrame per source.

    The data source of a file is the part of its name before the first '_'.
    Several files may map to the same source; their rows are concatenated in
    the order the files are listed and the index is renumbered.

    Args:
        player_folder: Path of the folder holding the player's files.
        wanted_sources: Collection of source names to load; files of any
            other source are ignored.

    Returns:
        dict mapping data source name -> DataFrame.
    """
    sources = {}
    for file in os.listdir(player_folder):
        if file.startswith('.'):
            continue
        data_source = file.split('_')[0]  # There might be repetitions
        if data_source not in wanted_sources:
            continue
        try:
            df = pd.read_csv(player_folder + '/' + file)
        except (OSError, ValueError) as exc:
            # Loading stays best-effort, as in the original script, but an
            # unreadable file is reported instead of vanishing silently.
            # pandas' EmptyDataError / ParserError and UnicodeDecodeError are
            # all ValueError subclasses.
            logger.warning("Skipping unreadable file %s/%s: %s", player_folder, file, exc)
            continue
        if data_source in sources:  # If already in dict it's appended
            sources[data_source] = pd.concat(
                [sources[data_source], df], axis=0
            ).reset_index(drop=True)
        else:
            sources[data_source] = df
    return sources


def _clean_source(data_source, df):
    """Apply source-specific fixes to *df*, then sort it by its 'time' column.

    NOTE(review): the indentation of the original script was lost; this
    assumes the sort / index reset applies to every source (not only to
    'envibox') and that the eyetracker fix runs after the sort -- confirm.

    Args:
        data_source: Source name, e.g. 'gamelog', 'hrm', 'datalog'.
        df: Raw DataFrame loaded for that source. It is not modified.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: if a column required for the source (e.g. 'time') is absent.
    """
    if data_source == 'gamelog':
        df = df.rename(columns={'Unnamed: 0': 'time'})
    elif data_source == 'hrm':
        # Readings below 45 are treated as fake data and removed. The mask is
        # negated (rather than testing >= 45) so that NaN readings are kept.
        mask_fake_data = df['hrm'] < 45
        df = df.loc[~mask_fake_data, :]
    elif data_source == 'datalog':
        rename_dict = {
            'sensor1': 'hrm2',  # Too complex
            'sensor2': 'resistance',
            'sensor3': 'muscle_activity',
        }
        df = df.drop(columns=['time_host', 'n']).rename(columns=rename_dict)
    elif data_source == 'envibox':
        df = df.drop(columns=['time_host', 'n'])

    # Non-inplace calls return fresh frames, so nothing below ever writes into
    # a slice of another DataFrame (avoids pandas' SettingWithCopyWarning).
    df = df.sort_values(by='time').reset_index(drop=True)

    if data_source == 'eyetracker':
        # Drop the first row because the data writing was interrupted.
        # The explicit copy makes the column assignment below well-defined.
        df = df.iloc[1:, :].copy()
        # Convert the parsed datetimes to POSIX timestamps (float seconds).
        df['time'] = pd.to_datetime(df['time']).apply(lambda x: x.timestamp())
    return df


# Load and clean every player's data; results accumulate in data_dict as
# player id -> {data source name -> DataFrame}.
for player_folder in player_folders:
    player_id = player_folder.split('/')[-1]
    # Sorting by time and fixing naming
    player_data_dict = {
        data_source: _clean_source(data_source, df)
        for data_source, df in _load_player_sources(player_folder, data_sources_list).items()
    }
    data_dict[player_id] = player_data_dict
# Persist the collected per-player data. The output directory is created
# first so that a missing 'data/' folder does not make the dump fail after
# all the processing above has already been done.
os.makedirs('data', exist_ok=True)
joblib.dump(data_dict, 'data/data_dict')
# plt.close()
# plt.plot(player_data_dict['datalog']['sensor3'].iloc[:10000])