-
Notifications
You must be signed in to change notification settings - Fork 0
/
spotify_utils.py
261 lines (214 loc) · 9.4 KB
/
spotify_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import sys
import spotipy
import spotipy.util as util
import calendar
import dateutil
import json
import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from numpy import nan as Nan
#scope = 'user-library-read'
#scope = 'user-top-read user-read-playback-state'
# scope = 'user-read-playback-state user-read-recently-played'
# token = util.prompt_for_user_token('aaronopp', scope, client_id='dbe2a20785304190b8e35d5d6644397b', client_secret='cc259d8378be48beaad9171a5afb19ba', redirect_uri='http://localhost:8888/callback')
# import dateutil.parser as dp
# just realized i should only create/save features at the end when i have all ids. then i
# only execute it when i need it!
def spotify_pipeline(token, scope, time_test_scaled, filename, data_gen_run_time=5, save=True):
#if not os.path.exists(filename):
#print 'data folder created'
#os.makedirs(filename)
print('\n \n \n \n \n run time min 2:', data_gen_run_time)
spotify_data_df = create_spotify_df(token, scope, time_test_scaled, num_songs=data_gen_run_time)
if save:
spotify_data_df.to_csv(filename + '_raw.csv')
print(spotify_data_df.index[1])
print(time_test_scaled[1:4])
print(spotify_data_df.index[1]- time_test_scaled[1])
spotify_dataset = create_spotify_dataset(spotify_data_df, time_test_scaled)
if save:
spotify_dataset.to_csv(filename + '.csv')
return spotify_data_df
def save_spotify_df(token, scope, filename):
spotify_data_df = create_spotify_df(token, scope)
spotify_data_df.to_csv(filename)
def get_api_history_song_number(num):
if num > 30:
return num/2
if num > 200:
print('\n \n can only take last 100 songs! \n \n')
return 100
def create_spotify_df(token, scope, get_genre=True, save_features=False, num_songs=5):
print('\n \n \n \n \n run time min 3:', num_songs)
timestamp = []
songs = []
artists = []
popularities = []
ids = []
artist_ids = []
genres = []
durations = []
if token:
sp = spotipy.Spotify(auth=token)
sp.trace=False
ranges = ['short_term', 'medium_term', 'long_term']
#esults = sp.current_user_playing_track()
try:
curr_results = sp.current_playback()
except AttributeError as err:
print (err)
curr_results = None
#print 'current playback keys ', curr_results.keys()
#print results['item']
#print 'user playing track keys', results.keys()
if curr_results != None:
print('song name: ', curr_results['item']['name'])
print('timestamp: ', curr_results['timestamp'])
curr_artist_info = curr_results['item']['artists']
print('artist name: ', curr_artist_info[0]['name'])
print('artist id: ', curr_artist_info[0]['id'])
#print curr_results['timestamp']
#print 'type name: ', curr_results['item']['type']
print('song id: ', curr_results['item']['id'])
print('\n starting spotify data with current playing song! \n')
timestamp.append(int(str(curr_results['timestamp'])))
songs.append(curr_results['item']['name'])
ids.append(curr_results['item']['id'])
artists.append(curr_artist_info[0]['name'])
artist_ids.append(curr_artist_info[0]['id'])
popularities.append(curr_results['item']['popularity'])
#if get_genre == True:
genres.append(get_genre_from_track_id(sp, curr_results['item']['id']))
durations.append(curr_results['progress_ms'])
else:
print('\n no track currently playing \n')
recent_results = sp.current_user_recently_played(limit=num_songs)
#print recent_results.keys()
#print recent_results['href']
#print recent_results['cursors']
#print recent_results['next']
for i, item in enumerate(recent_results['items']):
print('name: ', item['track']['name'])
print('id: ', item['track']['id'])
artist_info = item['track']['artists']
print('artist: ', artist_info[0]['name'])
print('duration ms: ', item['track']['duration_ms'])
print('popularity: ', item['track']['popularity'])
print('played: ', item['played_at'])
print('type', type(item['played_at']))
timestamp.append(item['played_at'])
songs.append(item['track']['name'])
ids.append(item['track']['id'])
artists.append(artist_info[0]['name'])
artist_ids.append(artist_info[0]['id'])
popularities.append(item['track']['popularity'])
#if get_genre == True:
genres.append(get_genre_from_track_id(sp, item['track']['id']))
durations.append(item['track']['duration_ms'])
#get timestamps
utc_timestamps = []
for index, time in enumerate(timestamp):
if index != 0:
utc_timestamps.append(calendar.timegm(dateutil.parser.parse(time).timetuple())*1000)
else:
utc_timestamps.append(time)
print(utc_timestamps)
# create dataframe of basic data.
spotify_data = {'timestamp': utc_timestamps, 'artist': artists, 'artist_id': artist_ids, 'song': songs, 'id': ids, 'genre': genres, 'popularity': popularities, 'duration': durations}
spotify_data_df = pd.DataFrame(spotify_data)
spotify_data_df = spotify_data_df.set_index('timestamp')
if save_features == True:
try:
save_spotify_features(sp, ids)
print('features saved!')
except:
print('unable to save features')
return spotify_data_df
def save_spotify_features(sp, ids):
# get audio features for each track and save as a JSON!
features = sp.audio_features(ids)
features_json = []
for feature in features:
#print feature
print((json.dumps(feature, indent=4)))
print()
features_json.append(json.dumps(feature, indent=4))
with open('data.json', 'a') as outfile:
json.dump(feature, outfile, indent=4)
#analysis = sp._get(feature['analysis_url'])
#print(json.dumps(analysis, indent=4))
#print()
def get_album_id_from_track(sp, id):
track = sp.track(id)
#pprint.pprint(track)
album_id = track['album']['id']
print(album_id)
return album_id
def get_genre_from_track_id(sp, id):
track = sp.track(id)
#pprint.pprint(track)
album_id = track['album']['id']
album = sp.album(str(album_id))
pprint.pprint(album['genres'])
return album['genres']
def get_audio_features_json(sp, dataframe, savefile):
# get audio features for each track and save as a JSON!
ids = spotify_data['id'].tolist()
features = sp.audio_features(ids)
features_json = []
for feature in features:
#print feature
print((json.dumps(feature, indent=4)))
print()
features_json.append(json.dumps(feature, indent=4))
with open(savefile, 'a') as outfile:
json.dump(feature, outfile, indent=4)
# OPTIONAL song analysis (too much irrelavant data ATM.)
#analysis = sp._get(feature['analysis_url'])
#print(json.dumps(analysis, indent=4))
#print()
####################################################
# Functions to match song data with timestamp data
#
####################################################
def match_spotify_dataset(dataframe, time_test_scaled):
j = 0
Y_logger_df = pd.DataFrame(data=None, columns=dataframe.columns)
for index_logger, timestamps in enumerate(time_test_scaled):
for index, value in enumerate(list(dataframe.index)):
if value < timestamps:
print(value-timestamps)
print(index)
print('\n timestamps - value: ', timestamps - value)
print('\n dataframe duration: ', dataframe.iloc[0]['duration'])
print(dataframe.iloc[index])
if timestamps - value > dataframe.iloc[0]['duration']:
print('\n \n \n song is not actually playing')
spotify_series = pd.Series([Nan, Nan, Nan, Nan, Nan, Nan, Nan],
index=['artist', 'artist_id', 'duration', 'genre', 'id',
'popularity', 'song'], name=value)
Y_logger_df = Y_logger_df.append(spotify_series)
break
Y_logger_df = Y_logger_df.append(dataframe.iloc[index])
j += 1
break
print('j:' , j)
if j == 0:
print('\n \n NO MATCHES - please extend your recent playback! \n \n')
print('terminating script')
sys.exit(1)
return Y_logger_df
def create_spotify_dataset(dataframe, time_test_scaled, save_df=False):
y_df = match_spotify_dataset(dataframe, time_test_scaled)
y_df['timestamps'] = time_test_scaled
y_df = y_df.reset_index()
y_df = y_df.rename(columns={'index': 'song_start_ts'})
y_df = y_df.set_index('timestamps')
if save_df == True:
y_df.to_csv('spotify_dataset.csv')
return y_df
# to write- get all IDs from big pandas dataframe and then save spotify features
# save spotify data df as dataframe? or just sync it w training data.