# data_extractor.py (249 lines, 230 loc, 11.1 KB)
# Forked from x4nth055/emotion-recognition-using-speech.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import numpy as np
import pandas as pd
import pickle
import tqdm
import os
from utils import get_label, extract_feature, get_first_letters
from collections import defaultdict
class AudioExtractor:
    """Featurize audio clips and hold them for the machine learning algorithms.

    Metadata (audio path + emotion label) is read from CSV description files.
    Features are either extracted with `extract_feature` or loaded from a
    cached ``.npy`` file inside `features_folder_name`. The results are kept
    per partition in the ``train_*`` / ``test_*`` attributes
    (``*_audio_paths``, ``*_emotions``, ``*_features``), which only exist once
    the matching partition has been loaded.
    """

    # The only partition names the loader understands.
    _PARTITIONS = ("train", "test")

    def __init__(self, audio_config=None, verbose=1, features_folder_name="features", classification=True,
                 emotions=None, balance=True):
        """
        Params:
            audio_config (dict): the dictionary that indicates what features to extract from the audio file,
                default is {'mfcc': True, 'chroma': True, 'mel': True, 'contrast': False, 'tonnetz': False}
                (i.e mfcc, chroma and mel)
            verbose (bool/int): verbosity level, 0 for silence, 1 for info, default is 1
            features_folder_name (str): the folder to store output features extracted, default is "features".
            classification (bool): whether it is a classification or regression, default is True (i.e classification)
            emotions (list): list of emotions to be extracted, default (None) means ['sad', 'neutral', 'happy']
            balance (bool): whether to balance dataset (both training and testing), default is True
        """
        self.audio_config = audio_config if audio_config else {
            'mfcc': True, 'chroma': True, 'mel': True, 'contrast': False, 'tonnetz': False,
        }
        self.verbose = verbose
        self.features_folder_name = features_folder_name
        self.classification = classification
        # A fresh list per instance instead of one list object shared through the signature.
        self.emotions = ['sad', 'neutral', 'happy'] if emotions is None else emotions
        self.balance = balance
        # input dimension, filled in the first time features are extracted or loaded
        self.input_dimension = None

    @staticmethod
    def _check_partition(partition):
        """Raise TypeError unless `partition` is "train" or "test"."""
        if partition not in AudioExtractor._PARTITIONS:
            raise TypeError("Invalid partition, must be either train/test")

    def _load_data(self, desc_files, partition, shuffle):
        """Load `partition` from `desc_files`, then optionally balance and shuffle it.

        Raises:
            TypeError: if `partition` is neither "train" nor "test".
        """
        # Validates `partition` before doing any work.
        self.load_metadata_from_desc_file(desc_files, partition)
        # balancing the datasets ( both training or testing )
        if self.balance:
            if partition == "train":
                self.balance_training_data()
            else:
                self.balance_testing_data()
        if shuffle:
            self.shuffle_data_by_partition(partition)

    def load_train_data(self, desc_files=None, shuffle=False):
        """Loads training data from the metadata files `desc_files`
        (default: ["train_speech.csv"])"""
        if desc_files is None:
            desc_files = ["train_speech.csv"]
        self._load_data(desc_files, "train", shuffle)

    def load_test_data(self, desc_files=None, shuffle=False):
        """Loads testing data from the metadata files `desc_files`
        (default: ["test_speech.csv"])"""
        if desc_files is None:
            desc_files = ["test_speech.csv"]
        self._load_data(desc_files, "test", shuffle)

    def shuffle_data_by_partition(self, partition):
        """Shuffle paths, emotions and features of `partition` with one shared permutation.

        Raises:
            TypeError: if `partition` is neither "train" nor "test".
        """
        self._check_partition(partition)
        audio_paths, emotions, features = shuffle_data(
            getattr(self, f"{partition}_audio_paths"),
            getattr(self, f"{partition}_emotions"),
            getattr(self, f"{partition}_features"),
        )
        setattr(self, f"{partition}_audio_paths", audio_paths)
        setattr(self, f"{partition}_emotions", emotions)
        setattr(self, f"{partition}_features", features)

    def load_metadata_from_desc_file(self, desc_files, partition):
        """Read metadata from a CSV file & Extract and loads features of audio files
        Params:
            desc_files (list): list of description files (csv files) to read from
            partition (str): whether is "train" or "test"
        Raises:
            TypeError: if `partition` is invalid, or if regression is requested
                with an unsupported number of emotions.
        """
        # Fail fast: no point extracting (and caching) features for a partition
        # that cannot be stored afterwards.
        self._check_partition(partition)
        # Read every file first and concatenate once; growing a frame file by
        # file re-copies all previous rows on each iteration.
        frames = [pd.read_csv(desc_file) for desc_file in desc_files]
        if frames:
            df = pd.concat(frames, sort=False)
        else:
            # no description files: empty dataframe with the expected columns
            df = pd.DataFrame({'path': [], 'emotion': []})
        if self.verbose:
            print("[*] Loading audio file paths and its corresponding labels...")
        # get columns
        audio_paths, emotions = list(df['path']), list(df['emotion'])
        # if not classification, convert emotions to numbers
        if not self.classification:
            # so naive and need to be implemented in a better way:
            # the mapping is chosen from the *number* of emotions only
            if len(self.emotions) == 3:
                self.categories = {'sad': 1, 'neutral': 2, 'happy': 3}
            elif len(self.emotions) == 5:
                self.categories = {'angry': 1, 'sad': 2, 'neutral': 3, 'ps': 4, 'happy': 5}
            else:
                raise TypeError("Regression is only for either ['sad', 'neutral', 'happy'] or ['angry', 'sad', 'neutral', 'ps', 'happy']")
            emotions = [self.categories[e] for e in emotions]
        # make features folder if does not exist (also creates missing parents
        # and tolerates the folder appearing between check and creation)
        os.makedirs(self.features_folder_name, exist_ok=True)
        # get label for features
        label = get_label(self.audio_config)
        # construct features file name
        n_samples = len(audio_paths)
        first_letters = get_first_letters(self.emotions)
        name = os.path.join(self.features_folder_name, f"{partition}_{label}_{first_letters}_{n_samples}.npy")
        features = self._load_or_extract_features(name, audio_paths, partition)
        self._store_partition(partition, audio_paths, emotions, features)

    def _load_or_extract_features(self, name, audio_paths, partition):
        """Return the feature matrix cached at `name`, extracting and saving it if absent.

        Also records `input_dimension` (when still unset) from the feature
        matrix, so it is available whether the features were extracted or
        loaded from the cache.
        """
        if os.path.isfile(name):
            # if file already exists, just load then
            if self.verbose:
                print("[+] Feature file already exists, loading...")
            features = np.load(name)
        else:
            # file does not exist, extract those features and dump them into the file
            features = np.array([
                extract_feature(audio_file, **self.audio_config)
                for audio_file in tqdm.tqdm(audio_paths, f"Extracting features for {partition}")
            ])
            # save it
            np.save(name, features)
        # Axis 1 of the stacked matrix is the leading dimension of one sample's feature.
        if self.input_dimension is None and features.ndim >= 2:
            self.input_dimension = features.shape[1]
        return features

    def _store_partition(self, partition, audio_paths, emotions, features):
        """Set the ``<partition>_*`` attributes, or append to them when already loaded."""
        paths_attr = f"{partition}_audio_paths"
        emotions_attr = f"{partition}_emotions"
        features_attr = f"{partition}_features"
        if not hasattr(self, paths_attr):
            # first load of this partition
            setattr(self, paths_attr, audio_paths)
            setattr(self, emotions_attr, emotions)
            setattr(self, features_attr, features)
            return
        if self.verbose:
            # "train" -> "training", "test" -> "testing"
            print(f"[*] Adding additional {partition}ing samples")
        getattr(self, paths_attr).extend(audio_paths)
        getattr(self, emotions_attr).extend(emotions)
        setattr(self, features_attr, np.vstack((getattr(self, features_attr), features)))

    def _balance_data(self, partition):
        """Truncate every class of `partition` to the size of the smallest class.

        The first `minimum` samples of each class are kept (in their current
        order) and the partition is rewritten grouped by class. If some class
        has no sample at all, nothing is changed and `self.balance` is set to
        False.

        Raises:
            TypeError: if `partition` is neither "train" nor "test".
        """
        self._check_partition(partition)
        emotions = getattr(self, f"{partition}_emotions")
        features = getattr(self, f"{partition}_features")
        audio_paths = getattr(self, f"{partition}_audio_paths")
        # regression, take actual numbers, not label emotion
        classes = list(self.emotions) if self.classification else list(self.categories.values())
        # single pass over the labels instead of one pass per class
        count = dict.fromkeys(classes, 0)
        for emotion in emotions:
            if emotion in count:
                count[emotion] += 1
        # get the minimum data samples to balance to (0 when there are no classes)
        minimum = min(count.values(), default=0)
        if minimum == 0:
            # won't balance, otherwise 0 samples will be loaded
            print("[!] One class has 0 samples, setting balance to False")
            self.balance = False
            return
        if self.verbose:
            print("[*] Balancing the dataset to the minimum value:", minimum)
        kept = dict.fromkeys(classes, 0)
        per_class = defaultdict(list)
        for emotion, feature, audio_path in zip(emotions, features, audio_paths):
            if kept[emotion] >= minimum:
                # minimum value exceeded
                continue
            kept[emotion] += 1
            per_class[emotion].append((feature, audio_path))
        balanced_emotions, balanced_features, balanced_paths = [], [], []
        for emotion, samples in per_class.items():
            for feature, audio_path in samples:
                balanced_emotions.append(emotion)
                balanced_features.append(feature)
                balanced_paths.append(audio_path)
        setattr(self, f"{partition}_emotions", balanced_emotions)
        setattr(self, f"{partition}_features", balanced_features)
        setattr(self, f"{partition}_audio_paths", balanced_paths)

    def balance_training_data(self):
        """Balance the training partition (see `_balance_data`)."""
        self._balance_data("train")

    def balance_testing_data(self):
        """Balance the testing partition (see `_balance_data`)."""
        self._balance_data("test")
def shuffle_data(audio_paths, emotions, features):
    """Reorder three parallel sequences with one shared random permutation.

    A single permutation of ``range(len(audio_paths))`` is drawn from NumPy's
    global random state and applied to all three inputs, so the i-th path,
    emotion and feature still belong together afterwards. The inputs are not
    modified.

    Params:
        audio_paths (list): Paths to audio clips
        emotions (list): Emotions in each audio clip
        features (list): features audio clips
    Returns:
        tuple: (audio_paths, emotions, features) as three new lists
    """
    order = np.random.permutation(len(audio_paths))
    shuffled_paths, shuffled_emotions, shuffled_features = [], [], []
    for idx in order:
        shuffled_paths.append(audio_paths[idx])
    for idx in order:
        shuffled_emotions.append(emotions[idx])
    for idx in order:
        shuffled_features.append(features[idx])
    return shuffled_paths, shuffled_emotions, shuffled_features
def load_data(train_desc_files, test_desc_files, audio_config=None, classification=True, shuffle=True,
              balance=True, emotions=['sad', 'neutral', 'happy']):
    """Load the train and test partitions and return them as a dictionary.

    A silent (``verbose=0``) `AudioExtractor` is built from the given options,
    the training description files are loaded first and the testing ones
    second, each with the requested `shuffle` setting.

    Returns:
        dict: "X_train" / "X_test" (features as numpy arrays),
            "y_train" / "y_test" (emotions as numpy arrays),
            "train_audio_paths" / "test_audio_paths" (as stored on the extractor)
            and "balance" (the extractor's `balance` attribute after loading).
    """
    extractor = AudioExtractor(
        audio_config=audio_config,
        classification=classification,
        emotions=emotions,
        balance=balance,
        verbose=0,
    )
    # training partition first, then testing
    extractor.load_train_data(train_desc_files, shuffle=shuffle)
    extractor.load_test_data(test_desc_files, shuffle=shuffle)

    result = {}
    result["X_train"] = np.array(extractor.train_features)
    result["X_test"] = np.array(extractor.test_features)
    result["y_train"] = np.array(extractor.train_emotions)
    result["y_test"] = np.array(extractor.test_emotions)
    result["train_audio_paths"] = extractor.train_audio_paths
    result["test_audio_paths"] = extractor.test_audio_paths
    result["balance"] = extractor.balance
    return result