# se4csai_group69.py

# -*- coding: utf-8 -*-
"""SE4CSAI_Group69.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Zpc9CmQ7scPk43FwA0ZarcxcbxlyLmvs
"""

# Commented out IPython magic to ensure Python compatibility.
## VERY IMPORTANT!!! RUN THIS CELL FIRST!!!
import os
from google.colab import drive
# Mount Google Drive under /content/drive; the dataset paths used further down
# all live below that mount point.
drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SE4CSAI
# Show the working directory (the %cd magic above is disabled in this export).
current_directory = os.getcwd()
print(current_directory)

#Importing the modules
import os

import pandas as pd
import numpy as np
import librosa
import librosa.display
import wave
import math, random
from IPython.display import Audio
from scipy.io import wavfile
from matplotlib import pyplot as plt
from google.colab import drive

#!wget https://archive.org/download/warblrb10k_public/warblrb10k_public_wav.zip
#!wget https://surfdrive.surf.nl/files/index.php/s/kQBIBvdqRu1d9Y8/download

"""Here starts the ETL Pipeline, this is the data extraction part

Making the links to the required datasets:   (we moeten de filenamen nog even  aanpassen)
"""

# First create a shortcut from the shared folder to "My Drive"; only then do
# the paths below resolve correctly.

# drive.mount('/content/drive', force_remount= True)

bird_detection_audiopath = "/content/drive/MyDrive/SE4CSAI/bird_audio"
os.chdir(bird_detection_audiopath)

# Load the bird-detection metadata once.
bird_detection_labels = pd.read_csv("/content/drive/MyDrive/SE4CSAI/warblrb10k_public_metadata_2018.csv")

# Notebook preview of the table (a bare expression; no effect when run as a script).
bird_detection_labels

# Build a dictionary from the first CSV column to the third so labels can be
# looked up by key instead of searching the table for every clip.
# Columns are selected by position with .iloc: integer keys on a row Series
# whose index holds column names are deprecated in pandas.
# NOTE(review): presumably column 0 is the clip id and column 2 the has-bird
# flag - confirm against the CSV header.
bird_detection_label_dictionary = dict(
    zip(bird_detection_labels.iloc[:, 0], bird_detection_labels.iloc[:, 2])
)

print(bird_detection_label_dictionary)

# Load the species metadata: tab-separated, and rows that cannot be parsed are
# skipped instead of aborting the read.
bird_species_labels = pd.read_csv("/content/drive/MyDrive/SE4CSAI/bird_species/ukxcmany/xcmeta.csv", on_bad_lines='skip', sep = "\t")

# Notebook preview of the table (a bare expression; no effect when run as a script).
bird_species_labels

# Map the first column (converted to str, so it can be matched against ids
# sliced out of file names) to the fourth column.
# Columns are selected by position with .iloc: integer keys on a row Series
# whose index holds column names are deprecated in pandas.
# NOTE(review): the fourth column is later stored under the name "gen" -
# confirm that it really is the intended label column of xcmeta.csv.
bird_species_label_dictionary = {
    str(recording_id): label
    for recording_id, label in zip(
        bird_species_labels.iloc[:, 0], bird_species_labels.iloc[:, 3]
    )
}

print(bird_species_label_dictionary)

# Spot check of a single id (only displayed in a notebook).
bird_species_label_dictionary.get("132608")

# Illustration that the audio can actually be read, using one clip. A loop
# further down reads all files; what happens with them afterwards is up to us.
sample_clip_path = '/content/drive/MyDrive/SE4CSAI/bird_audio/0a0b783d-f9a3-4652-a01d.wav'

# Raw read with scipy: wavfile.read returns the sample rate first, then the samples.
u, wav = wavfile.read(sample_clip_path)

print(wav)
# Second read of the same clip through librosa, which also returns the sample rate.
x, sr = librosa.load(sample_clip_path)

# Waveform plot of the scipy samples on the current figure.
plt.plot(wav)

# Short-time Fourier transform of the librosa signal (complex-valued matrix).
X = librosa.stft(x)
print(X)
# Take the magnitude and convert it to decibels before plotting.
magnitude = np.abs(X)
Xdb = librosa.amplitude_to_db(magnitude)
# The spectrogram goes on a new, wider figure with a colour bar.
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
plt.colorbar()

# We still need to work out how to feed this into the dataset or neural net (rather important).
# Only complex numbers end up stored in the matrix and it is unclear whether that is right; storing it all also takes quite long.

"""Combining audio data with label data of bird detection.:

zet de bird detection audio in een dataset en de bird species audio
:
"""

# Build both datasets. This loop took about 11 minutes for the original author.
#
# Rows are collected in plain lists and converted to DataFrames once at the
# end, instead of growing a DataFrame one cell at a time with .loc. Every
# classification row goes into bird_classification_dataset; the previous code
# wrote part of each species row to an undefined name (bird_audio_dataset).
classification_rows = []
species_rows = []

# Part 1: the detection clips, each stored with its dB spectrogram and the
# label looked up in the detection metadata.
for file in os.listdir(bird_detection_audiopath):
    audiofile = os.path.join(bird_detection_audiopath, file)
    # librosa.load is used so the samples come back already normalised
    # (per the original authors' note).
    x, sr = librosa.load(audiofile)

    X = librosa.stft(x)
    Xdb = librosa.amplitude_to_db(abs(X))

    classification_rows.append({
        "Filename": file,
        "Data": Xdb,
        # file[:-4] drops the last four characters (the ".wav" extension)
        # to obtain the metadata key; unknown keys yield None.
        "IsBird": bird_detection_label_dictionary.get(file[:-4]),
    })

# Part 2: the species recordings.
bird_species_audiopath = "/content/drive/MyDrive/SE4CSAI/bird_species/ukxcmany/flac/"
os.chdir(bird_species_audiopath)
for file in os.listdir(bird_species_audiopath):
    audiofile = os.path.join(bird_species_audiopath, file)
    x, sr = librosa.load(audiofile)
    X = librosa.stft(x)
    Xdb = librosa.amplitude_to_db(abs(X))

    # Every species recording is also added to the classification dataset as a
    # positive example, to increase the dataset size.
    classification_rows.append({"Filename": file, "Data": Xdb, "IsBird": 1})

    # file[2:-5] drops the first two and the last five characters of the name.
    # NOTE(review): presumably an "xc" prefix and the ".flac" extension - confirm.
    recording_id = file[2:-5]
    # The species dataset keeps the raw audio signal together with the label
    # looked up in the species metadata.
    species_rows.append({
        "Filename": recording_id,
        "Data": x,
        "gen": bird_species_label_dictionary.get(recording_id),
    })

bird_classification_dataset = pd.DataFrame(classification_rows, columns=["Filename", "Data", "IsBird"])
bird_species_dataset = pd.DataFrame(species_rows, columns=["Filename", "Data", "gen"])

# Row counts, kept under the original counter names.
class_counter = len(classification_rows)
species_counter = len(species_rows)

"""Hieronder is ter demonstrtatie"""

# Notebook previews of the two datasets. These are bare expressions: a notebook
# displays the value of the last one in a cell, a plain script shows nothing.
bird_species_dataset # displays the species dataset

bird_classification_dataset

"""Uitrekenen hoeveel wel een bird hebben en hoeveel niet."""

# Count the clips labelled as containing a bird. The labels are read from
# bird_classification_dataset (the previously used name bird_audio_dataset is
# never defined), and the total in the message is the real number of rows
# rather than a hard-coded figure.
is_bird_labels = bird_classification_dataset['IsBird']
birdcounter = sum(1 for isbirdvalue in is_bird_labels if isbirdvalue == 1)
print("In these", len(is_bird_labels), "audio clips,", birdcounter, "really contain bird sounds.")

"""Splitting all the data and oversampling """

from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split

# Features are the "Data" column and targets the "IsBird" label cast to int.
# Both come from bird_classification_dataset (the previously used name
# bird_audio_dataset is never defined).
X = bird_classification_dataset[["Data"]]
y = bird_classification_dataset[["IsBird"]].astype('int')

# Hold out 20% of the rows as the test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Only the training split is randomly oversampled; the test split is left as drawn.
ros = RandomOverSampler()

X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

# Class balance after resampling (only displayed in a notebook).
y_train_res['IsBird'].value_counts()

# Persist both datasets to Drive.
# NOTE(review): the "Data" columns hold NumPy arrays, which to_csv writes as
# their text representation - verify the files can be read back as intended.
bird_classification_dataset.to_csv('/content/drive/MyDrive/SE4CSAI/classification_dataset.csv')
bird_species_dataset.to_csv('/content/drive/MyDrive/SE4CSAI/species_dataset.csv')

"""Nou heren, we hebben als het goed is nu een beetje alle datasets klaar gemaakt, nu moeten we een manier vinden om dit netjes in een CNN of ander algoritme te stoppen."""