-
Notifications
You must be signed in to change notification settings - Fork 1
/
se4csai_group69.py
180 lines (121 loc) · 6.47 KB
/
se4csai_group69.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# -*- coding: utf-8 -*-
"""SE4CSAI_Group69.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Zpc9CmQ7scPk43FwA0ZarcxcbxlyLmvs
"""
# Commented out IPython magic to ensure Python compatibility.
# VERY IMPORTANT: run this cell first!
import os
from google.colab import drive
# Mount Google Drive at /content/drive; every dataset path used further down
# in this script lives under /content/drive/MyDrive/SE4CSAI.
drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SE4CSAI
# Show the current working directory.
print(os.getcwd())
#Importing the modules
import os
import pandas as pd
import numpy as np
import librosa
import librosa.display
import wave
import math, random
from IPython.display import Audio
from scipy.io import wavfile
from matplotlib import pyplot as plt
from google.colab import drive
#!wget https://archive.org/download/warblrb10k_public/warblrb10k_public_wav.zip
#!wget https://surfdrive.surf.nl/files/index.php/s/kQBIBvdqRu1d9Y8/download
"""Here starts the ETL Pipeline, this is the data extraction part
Setting up the paths to the required datasets (we still need to adjust the file names):
"""
# First create a shortcut from the shared folder to "My Drive"; only then do
# the paths below resolve. If the mount is stale, remount with
# drive.mount('/content/drive', force_remount=True).
bird_detection_audiopath = "/content/drive/MyDrive/SE4CSAI/bird_audio"
os.chdir(bird_detection_audiopath)

# Metadata of the bird-detection clips. The file is read once (the original
# read the same CSV twice in a row).
bird_detection_labels = pd.read_csv("/content/drive/MyDrive/SE4CSAI/warblrb10k_public_metadata_2018.csv")
bird_detection_labels  # notebook display

# Dictionary from the first metadata column to the third one, so labels can be
# looked up by key instead of scanning the frame. The columns are taken
# explicitly by position with .iloc: integer keys on a label-indexed row
# (x[1][0]) rely on a positional fallback that pandas has deprecated.
bird_detection_label_dictionary = dict(
    zip(bird_detection_labels.iloc[:, 0], bird_detection_labels.iloc[:, 2])
)
print(bird_detection_label_dictionary)
# Tab-separated metadata of the bird-species recordings; malformed lines are
# skipped rather than aborting the load.
bird_species_labels = pd.read_csv("/content/drive/MyDrive/SE4CSAI/bird_species/ukxcmany/xcmeta.csv", on_bad_lines='skip', sep = "\t")
bird_species_labels  # notebook display

# Dictionary from the stringified first column to the fourth column (per the
# original comment: file name -> species name). The columns are read by
# position with .iloc instead of integer keys on a label-indexed row, which is
# a deprecated pandas fallback. Reading the id column directly also keeps it
# from being upcast together with the other columns of a row before str().
bird_species_label_dictionary = {
    str(recording_id): species_label
    for recording_id, species_label in zip(
        bird_species_labels.iloc[:, 0], bird_species_labels.iloc[:, 3]
    )
}
print(bird_species_label_dictionary)
bird_species_label_dictionary.get("132608")  # spot check of a single id
# Illustration that the audio can actually be read; further down a loop reads
# every file, and it is up to us to do something useful with the result.
sample_clip_path = '/content/drive/MyDrive/SE4CSAI/bird_audio/0a0b783d-f9a3-4652-a01d.wav'

# Raw samples as stored in the WAV file.
u, wav = wavfile.read(sample_clip_path)
print(wav)

# The same clip loaded through librosa; the waveform plot still shows the
# scipy samples.
x, sr = librosa.load(sample_clip_path)
plt.plot(wav)

# Short-time Fourier transform, then its magnitude converted to decibels.
X = librosa.stft(x)
print(X)
Xdb = librosa.amplitude_to_db(np.abs(X))

# Spectrogram on its own figure, with a colour bar.
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
plt.colorbar()

# TODO: decide how to feed this into the dataset / neural net (fairly important).
# The STFT yields a matrix of complex numbers; it is unclear whether that is the
# right representation, and storing all of it takes quite long.
"""Combining audio data with label data of bird detection.:
Put the bird detection audio into one dataset, and the bird species audio into a dataset as well:
"""
# Build the two datasets from the audio folders. The original authors note
# that this loop took about 11 minutes for them.
#
# Rows are collected in plain lists and each DataFrame is built once at the
# end; enlarging a DataFrame row by row with .loc copies it on every insert.
classification_rows = []  # columns: Filename, Data (dB spectrogram), IsBird
species_rows = []         # columns: Filename (recording id), Data (waveform), gen

# Part 1: bird-detection clips, labelled through the detection dictionary.
# Sorted so the row order is reproducible between runs.
for file in sorted(os.listdir(bird_detection_audiopath)):
    audiofile = os.path.join(bird_detection_audiopath, file)
    x, sr = librosa.load(audiofile)  # floating-point samples and sample rate
    X = librosa.stft(x)
    Xdb = librosa.amplitude_to_db(abs(X))
    classification_rows.append({
        "Filename": file,
        "Data": Xdb,
        # file[:-4] drops the 4-character extension to get the dictionary key;
        # .get() yields None when the clip has no metadata entry.
        "IsBird": bird_detection_label_dictionary.get(file[:-4]),
    })

# Part 2: bird-species recordings.
bird_species_audiopath = "/content/drive/MyDrive/SE4CSAI/bird_species/ukxcmany/flac/"
os.chdir(bird_species_audiopath)
for file in sorted(os.listdir(bird_species_audiopath)):
    audiofile = os.path.join(bird_species_audiopath, file)
    x, sr = librosa.load(audiofile)
    X = librosa.stft(x)
    Xdb = librosa.amplitude_to_db(abs(X))

    # Every species recording is also added to the classification dataset as a
    # positive example, to increase the dataset size.
    classification_rows.append({"Filename": file, "Data": Xdb, "IsBird": 1})

    # The species dataset keeps the raw waveform. file[2:-5] strips the first
    # two characters and the 5-character extension, leaving the key used in
    # the species dictionary.
    recording_id = file[2:-5]
    species_rows.append({
        "Filename": recording_id,
        "Data": x,
        "gen": bird_species_label_dictionary.get(recording_id),
    })

bird_classification_dataset = pd.DataFrame(
    classification_rows, columns=["Filename", "Data", "IsBird"]
)
bird_species_dataset = pd.DataFrame(
    species_rows, columns=["Filename", "Data", "gen"]
)

# Kept for compatibility: the original exposed these running counters.
class_counter = len(classification_rows)
species_counter = len(species_rows)

# The original wrote the species rows' Filename/IsBird to `bird_audio_dataset`,
# a name that was never defined (NameError). Later statements in this script
# use that name for the combined frame, so bind it to the same object.
bird_audio_dataset = bird_classification_dataset
"""Hieronder is ter demonstrtatie"""
# Notebook display of the two datasets built above; as bare expressions these
# lines have no effect when the file is run as a plain script.
bird_species_dataset # shows the species dataset
bird_classification_dataset
"""Uitrekenen hoeveel wel een bird hebben en hoeveel niet."""
# Count how many clips of the classification dataset are labelled as bird.
# The frame is `bird_classification_dataset`; the original iterated over
# `bird_audio_dataset`, a name that is never defined in this script.
is_bird_labels = bird_classification_dataset['IsBird']
birdcounter = sum(1 for isbirdvalue in is_bird_labels if isbirdvalue == 1)

# Report the real number of clips instead of a hard-coded 8003.
print("In these", len(is_bird_labels), "audio clips,", birdcounter, "really contain bird sounds.")
"""Splitting all the data and oversampling """
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split

# Features and labels come from `bird_classification_dataset`; the original
# referred to `bird_audio_dataset`, a name that is never defined.
X = bird_classification_dataset[["Data"]]
# NOTE(review): astype('int') fails if any clip has a missing label, e.g. a
# file without an entry in the label dictionary.
y = bird_classification_dataset[["IsBird"]].astype('int')

# Hold out 20% for testing. Only the training split is oversampled, so the
# test split keeps its original class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
ros = RandomOverSampler()
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
y_train_res['IsBird'].value_counts()  # notebook display of the class counts

# Persist both datasets to Drive.
# NOTE(review): the "Data" cells hold NumPy arrays, which to_csv writes as
# their text representation - confirm these CSVs can be read back losslessly.
bird_classification_dataset.to_csv('/content/drive/MyDrive/SE4CSAI/classification_dataset.csv')
bird_species_dataset.to_csv('/content/drive/MyDrive/SE4CSAI/species_dataset.csv')
"""Nou heren, we hebben als het goed is nu een beetje alle datasets klaar gemaakt, nu moeten we een manier vinden om dit netjes in een CNN of ander algoritme te stoppen."""