xiaozhouwang · kweonwooj · Apr 6, 2018 · Apr 6, 2018 · Apr 7, 2018 · Apr 11, 2018
diff --git a/data.py b/data.py
@@ -25,10 +25,11 @@ def __init__(self, mode, label_words_dict, wav_list, add_noise, preprocess_fun,
         """
         self.mode = mode
         self.label_words_dict = label_words_dict
-        self.wav_list = wav_list
+        self.wav_list = wav_list[0]
+        self.label_list = wav_list[1]
         self.add_noise = add_noise
         self.sr = sr
-        self.n_silence = int(len(wav_list) * 0.09)
+        self.n_silence = int(len(self.wav_list) * 0.09)
         self.preprocess_fun = preprocess_fun
         self.preprocess_param = preprocess_param
 
@@ -100,8 +101,7 @@ def __getitem__(self, idx):
             if self.mode == 'test':
                 return {'spec': wav_tensor, 'id': self.wav_list[idx]}
 
-            label = self.label_words_dict[self.wav_list[idx].split("/")[-2]] if self.wav_list[idx].split(
-                "/")[-2] in self.label_words_dict else len(self.label_words_dict)
+            label = self.label_words_dict.get(self.label_list[idx], len(self.label_words_dict))
 
             return {'spec': wav_tensor, 'id': self.wav_list[idx], 'label': label}
 
@@ -132,33 +132,47 @@ def get_wav_list(words, unknown_ratio=0.2):
 
     # sample full train list
     sampled_train_list = []
+    sampled_train_labels = []
     for w in full_train_list:
         l = w.split("/")[-2]
         if l not in words:
             if random.random() < unknown_ratio:
                 sampled_train_list.append(w)
+                sampled_train_labels.append('unknown')
         else:
             sampled_train_list.append(w)
+            sampled_train_labels.append(l)
 
-    return sampled_train_list, full_test_list
+    return sampled_train_list, sampled_train_labels, full_test_list
 
 
 def get_sub_list(num, sub_path):
     lst = []
     df = pd.read_csv(sub_path)
     words = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']
     each_num = int(num * 0.085)
+    labels = []
     for w in words:
         tmp = df['fname'][df['label'] == w].sample(each_num).tolist()
         lst += ["../input/test/audio/" + x for x in tmp]
-    return lst
+        for _ in range(len(tmp)):
+            labels.append(w)
+    return lst, labels
 
 
 def get_semi_list(words, sub_path, unknown_ratio=0.2, test_ratio=0.2):
-    train_list, _ = get_wav_list(words=words, unknown_ratio=unknown_ratio)
-    test_list = get_sub_list(num=int(len(train_list) * test_ratio), sub_path=sub_path)
-    lst = train_list + test_list
-    return sample(lst, len(lst))
+    train_list, train_labels, _ = get_wav_list(words=words, unknown_ratio=unknown_ratio)
+    test_list, test_labels = get_sub_list(num=int(len(train_list) * test_ratio), sub_path=sub_path)
+    file_list = train_list + test_list
+    label_list = train_labels + test_labels
+    assert(len(file_list) == len(label_list))
+
+    random.seed(2018)
+    file_list = sample(file_list, len(file_list))
+    random.seed(2018)
+    label_list = sample(label_list, len(label_list))
+
+    return file_list, label_list
 
 
 def preprocess_mfcc(wave):
@@ -189,4 +203,4 @@ def preprocess_wav(wav, normalization=True):
     if normalization:
         mean = data.mean()
         data -= mean
-    return data
+    return data
diff --git a/trainer.py b/trainer.py
@@ -65,18 +65,18 @@ def get_model(model=model_class, m=MGPU, pretrained=pretrained):
             optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, speechmodel.parameters()), lr=learning_rate, momentum=0.9, weight_decay=0.00001)
             speechmodel.train()
             if semi_train_path:
-                train_list = get_semi_list(words=label_to_int.keys(), sub_path=semi_train_path,
+                train_list, label_list = get_semi_list(words=label_to_int.keys(), sub_path=semi_train_path,
                                            test_ratio=choice([0.2, 0.25, 0.3, 0.35]))
                 print("semi training list length: ", len(train_list))
             else:
-                train_list, _ = get_wav_list(words=label_to_int.keys())
+                train_list, label_list, _ = get_wav_list(words=label_to_int.keys())
 
             if pretraining:
                 traindataset = PreDataset(label_words_dict=label_to_int,
                                           add_noise=True, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param,
                                           resize_shape=reshape_size, is_1d=is_1d)
             else:
-                traindataset = SpeechDataset(mode='train', label_words_dict=label_to_int, wav_list=train_list,
+                traindataset = SpeechDataset(mode='train', label_words_dict=label_to_int, wav_list=(train_list, label_list),
                                              add_noise=True, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param,
                                              resize_shape=reshape_size, is_1d=is_1d)
             trainloader = DataLoader(traindataset, BATCH_SIZE, shuffle=True)
@@ -108,8 +108,8 @@ def get_model(model=model_class, m=MGPU, pretrained=pretrained):
         trained_models = ["model/model_%s_%s.pth" % (CODER, b) for b in range(bagging_num)]
 
         # prediction
-        _, test_list = get_wav_list(words=label_to_int.keys())
-        testdataset = SpeechDataset(mode='test', label_words_dict=label_to_int, wav_list=test_list,
+        _, _, test_list = get_wav_list(words=label_to_int.keys())
+        testdataset = SpeechDataset(mode='test', label_words_dict=label_to_int, wav_list=(test_list, []),
                                     add_noise=False, preprocess_fun=preprocess_fun, preprocess_param=preprocess_param,
                                     resize_shape=reshape_size, is_1d=is_1d)
         testloader = DataLoader(testdataset, BATCH_SIZE, shuffle=False)