#!/usr/bin/env python
# coding: utf-8

# This notebook contains some code snippets I used in writing the following blog post: https://dev.to/ericbonfadini/finding-my-new-favorite-song-on-spotify-4lgc

# While I'm developing (or just while I'm commuting to work) I usually love to listen to some rock music.
#
# I created some playlists on Spotify, but lately I've been stuck on the same playlist, containing my favorite "Indie Rock" songs.
# This playlist is made up of more or less 45 songs I discovered through the years in several ways.
#
# Since I was starting to get bored of always listening to the same songs, last weekend I decided to analyze my playlist using Spotify APIs in order to discover insights and hopefully to find some new tunes I could add.
#
# Here's what I did in more or less 300 lines of Python 3 code (boilerplate included).

# # Setting up the environment
# For my analysis I set up a Python 3 virtual environment with the following libraries:
# - Pandas for data analysis
# - Seaborn for data visualization
# - Spotipy for interaction with Spotify APIs
#
# In order to access the Spotify APIs I registered my app and then I provided the spotipy library with the client_id, the client_secret and a redirect url.

# In[ ]:


import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import spotipy
from matplotlib import style
from spotipy import util

# Improve Pandas display settings
pd.set_option("display.width", 600)
pd.set_option("display.max_columns", 50)

# Change Seaborn default settings
sns.set_context('talk')
style.use('ggplot')

# Spotify API limit: number of ids/results requested per API call in this notebook
API_LIMIT = 50


# # Analyzing my playlist tracks
# After a first API call to get my playlist id, I got all the tracks of my playlist with another API call along with some basic information like: song id, song name, artist id, artist name, album name, song popularity.
#
# With another API call I got some extra information about the artists in my playlist, like genres and artist_popularity.
#
# Finally with another API call I got some insightful information about my tracks:
# - duration_ms: the duration of the track in milliseconds;
# - acousticness: describes the acousticness of a song (1 => high confidence the track is acoustic). It ranges from 0 to 1;
# - danceability: describes the danceability of a song (1 => high confidence the track is danceable). It ranges from 0 to 1;
# - energy: a perceptual measure of intensity and activity (e.g. death metal has high energy while classical music has low energy). It ranges from 0 to 1;
# - instrumentalness: predicts whether a track contains no vocals (1 => high confidence the track has no vocals). It ranges from 0 to 1;
# - liveness: detects the presence of an audience in the recording (1 => high confidence the track is live). It ranges from 0 to 1;
# - loudness: the overall loudness of a track in decibels. It ranges from -60dB to 0dB;
# - valence: describes the musical positiveness conveyed by a track (1 => more positive, 0 => more negative). It ranges from 0 to 1;
# - speechiness: detects the presence of spoken words in a track (1 => speech, 0 => non speech, just music). It ranges from 0 to 1;
# - key: the key of the song in pitch class notation. It ranges from 0 to 11;
# - mode: the modality of a track (0 => minor, 1 => major);
# - tempo: the overall estimated tempo of a track in beats per minute (BPM);
# - time_signature: an estimated overall time signature of a track (how many beats are in each bar or measure).
#
# The results of all these calls have been put inside Pandas dataframes in order to simplify the data analysis and then merged in a single dataframe using artist IDs and track IDs.
# Some values (like song/artist popularity and tempo) have been normalized.
# In[ ]:


username = "eric.bonfadini"
playlist_name = "Indie Rock"

# Get token
scope = 'playlist-modify-public'
spotify_token = util.prompt_for_user_token(username, scope)

# Create client
sp = spotipy.Spotify(auth=spotify_token)


# In[ ]:


# Get playlist ID
# NOTE(review): user_playlists is called once, so only the playlists contained
# in that single response are searched for the name.
playlists_results = sp.user_playlists(username)
playlist_ids = [playlist['id'] for playlist in playlists_results['items']
                if playlist['name'] == playlist_name]
if not playlist_ids:
    raise ValueError("Cannot find playlist named: {}".format(playlist_name))
print(playlist_ids)


# In[ ]:


# Get tracks, following the paging links so that playlists longer than one
# page of results are read completely (sp.next returns None after the last page)
tracks_results = sp.user_playlist(username, playlist_ids[0])
track_items = []
tracks_page = tracks_results['tracks']
while tracks_page:
    track_items += tracks_page['items']
    tracks_page = sp.next(tracks_page)

# Entries without a track payload or without a track id cannot be looked up
# in the following API calls (presumably local files or removed tracks), so skip them
track_items = [t for t in track_items if t.get("track") and t["track"].get("id")]

df_tracks = pd.DataFrame(
    [[t["track"]["id"],
      t["track"]["name"],
      t["track"]["artists"][0]["id"],
      t["track"]["artists"][0]["name"],
      t["track"]["album"]["name"],
      t["track"]["popularity"]] for t in track_items],
    columns=["id", "song_name", "artist_id", "artist_name", "album_name", "popularity"])

# Normalize popularity (divide by 100 to bring it on the same 0-1 scale as the audio features)
df_tracks["popularity_norm"] = df_tracks["popularity"] / 100.
df_tracks.head()


# In[ ]:


def _get_artists_df(sp, artist_ids):
    """
    Helper to get artists' information from artist ids, API_LIMIT ids per call.

    sp is the client whose ``artists`` method is called; artist_ids is a list of artist ids.
    It returns a Pandas dataframe with the columns artist_id, artist_genres,
    artist_popularity and artist_popularity_norm (artist_popularity / 100).
    An empty id list produces an empty dataframe with the same columns.
    """
    artist_list = []
    for call_number, start in enumerate(range(0, len(artist_ids), API_LIMIT), start=1):
        print("Call #{} for artists".format(call_number))
        artists_results = sp.artists(artist_ids[start:start + API_LIMIT])
        artist_list.extend([a["id"], a["genres"], a["popularity"]]
                           for a in artists_results["artists"])

    df_artists = pd.DataFrame(artist_list,
                              columns=["artist_id", "artist_genres", "artist_popularity"])
    df_artists["artist_popularity_norm"] = df_artists["artist_popularity"] / 100.
    return df_artists


# In[ ]:


artist_ids = df_tracks["artist_id"].unique().tolist()
df_artists = _get_artists_df(sp, artist_ids)
df_artists.head()


# In[ ]:


def _get_features_df(sp, track_ids):
    """
    Helper to get tracks' audio features from track ids, API_LIMIT ids per call.

    sp is the client whose ``audio_features`` method is called; track_ids is a list of track ids.
    It returns a Pandas dataframe with one row per track for which features were
    returned, restricted to the feature columns used in this notebook plus
    tempo_norm, the tempo rescaled from the 24-200 range to 0-1.
    An empty id list produces an empty dataframe with the same columns.
    """
    feature_list = []
    for call_number, start in enumerate(range(0, len(track_ids), API_LIMIT), start=1):
        print("Call #{} for audio features".format(call_number))
        features_results = sp.audio_features(track_ids[start:start + API_LIMIT])
        # A missing result (None) for a track would break the dataframe
        # construction below, so keep only real feature dictionaries
        feature_list.extend(f for f in (features_results or []) if f is not None)

    # Passing columns= (instead of selecting afterwards) also works when
    # feature_list is empty
    df_features = pd.DataFrame(
        feature_list,
        columns=["id", "analysis_url", "duration_ms", "acousticness", "danceability",
                 "energy", "instrumentalness", "liveness", "loudness", "valence",
                 "speechiness", "key", "mode", "tempo", "time_signature"])

    # tempo is in range 24-200 ==> 0-176, normalize it
    df_features["tempo_norm"] = (df_features["tempo"] - 24) / 176.
    return df_features


# In[ ]:


track_ids = df_tracks["id"].unique().tolist()
df_features = _get_features_df(sp, track_ids)
df_features.head()


# In[ ]:


# Create a df for current playlist merging the dataframes
df_cur = df_features.merge(df_tracks, on="id")
df_cur = df_cur.merge(df_artists, on="artist_id")

# Create a new column with full name of the song
df_cur["full_name"] = df_cur["artist_name"] + " -- " + df_cur["song_name"]

# Sort by song popularity
df_cur = df_cur.sort_values("popularity", ascending=False)
df_cur.head()


# # Exploratory Data Analysis
# After ensuring that the artists in my playlist all contain "Indie Rock" as genre, I checked my playlist using shape, info and describe of the full dataframe

# In[ ]:


print(df_cur.info())
df_cur.describe()


# In[ ]:


# Convert time_signature and key to category
df_cur["time_signature"] = df_cur["time_signature"].astype(
    pd.api.types.CategoricalDtype(categories=[1, 2, 3, 4, 5]))
df_cur["key"] = df_cur["key"].astype(
    pd.api.types.CategoricalDtype(categories=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]))


# In[ ]:


def _distplot(df, key, label, x_limits):
    """
    Helper to plot the distribution chart of one column.

    df is the dataframe, key the column to plot (also used as chart title),
    label the legend entry and x_limits either None or a (min, max) pair
    applied to the x axis.
    """
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases;
    # confirm the installed version still provides it.
    ax = sns.distplot(df[[key]], bins=30, label=label)
    if x_limits is not None:
        ax.set_xlim(*x_limits)
    plt.title(key)
    plt.legend()
    plt.show()


# In[ ]:


# x axis limits per column; columns not listed here fall back to (0, 1),
# None means "let the plot choose"
x_limits = {"duration_ms": None,
            "loudness": (-60, 0),
            "tempo": (24, 200),
            "popularity": (0, 100),
            "artist_popularity": (0, 100)}

for key in ["duration_ms", "acousticness", "danceability", "energy", "instrumentalness",
            "liveness", "loudness", "valence", "speechiness", "tempo", "popularity",
            "artist_popularity"]:
    _distplot(df_cur, key, label="My Playlist", x_limits=x_limits.get(key, (0, 1)))


# In[ ]:


def _countplot(df, key, label):
    """
    Helper to plot the count chart of one column.

    df is the dataframe, key the column to count and label the chart title.
    """
    ax = sns.countplot(data=df, x=key, palette="tab20")
    ax.set_title(label)
    plt.show()


# In[ ]:


for key in ["key", "time_signature", "mode"]:
    _countplot(df_cur, key, label="My Playlist")


# In[ ]:


ax = sns.boxplot(data=df_cur[["acousticness", "danceability", "energy", "instrumentalness",
                              "liveness", "valence", "speechiness", "artist_popularity_norm",
                              "popularity_norm", "tempo_norm"]])
ax.set_title("My Playlist")
plt.show()


# All these graphs show that I like songs with low acousticness/instrumentalness/speechiness, high energy/loudness/tempo, high artist popularity and duration of more or less 200 seconds.
# Valence and song popularity span a wide range, meaning that I have in my playlist both well-known and unknown songs, and both positive and negative ones.
#
# But how does my playlist compare against the Indie Rock genre?

# # Comparing my playlist with a sample of the genre
# I used some calls to the search API with 'genre:"Indie Rock"' as a keyword and 'type=tracks' to get a sample of the Indie Rock genre (5000 songs in total).
# This API also offers some nice keywords like 'tag:hipster' (to get only albums with the lowest 10% popularity) or 'year:1980-2020' (to get only tracks released in a specific year range).
# In[ ]:


number_of_tracks = 5000
genre = "indie rock"

# Number of search calls needed to collect the sample, API_LIMIT results per call
search_runs = number_of_tracks // API_LIMIT
search_list = []
for i in range(search_runs):
    print("Call #{} for tracks".format(i+1))
    search_results = sp.search('genre:"{}"'.format(genre), type="track",
                               limit=API_LIMIT, offset=API_LIMIT*i)
    search_list += [[t["id"],
                     t["name"],
                     t["artists"][0]["id"],
                     t["artists"][0]["name"],
                     t["album"]["name"],
                     t["popularity"]] for t in search_results['tracks']['items']]

df_search = pd.DataFrame(search_list,
                         columns=["id", "song_name", "artist_id", "artist_name",
                                  "album_name", "popularity"])
df_search["popularity_norm"] = df_search["popularity"] / 100.
df_search.head()


# In[ ]:


# NOTE: df_features and df_artists are reused here and from now on describe the
# genre sample, not the original playlist (whose data lives on in df_cur)
track_ids = df_search["id"].unique().tolist()
df_features = _get_features_df(sp, track_ids)
df_features.head()


# In[ ]:


artist_ids = df_search["artist_id"].unique().tolist()
df_artists = _get_artists_df(sp, artist_ids)
df_artists.head()


# In[ ]:


df_sample = df_features.merge(df_search, on="id")
df_sample = df_sample.merge(df_artists, on="artist_id")
df_sample["full_name"] = df_sample["artist_name"] + " -- " + df_sample["song_name"]
df_sample = df_sample.sort_values("popularity", ascending=False)
df_sample.head()


# In[ ]:


# Convert time_signature and key to category
df_sample["time_signature"] = df_sample["time_signature"].astype(
    pd.api.types.CategoricalDtype(categories=[1, 2, 3, 4, 5]))
df_sample["key"] = df_sample["key"].astype(
    pd.api.types.CategoricalDtype(categories=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]))


# In[ ]:


print(df_sample.info())
df_sample.describe()


# Then I repeated the same analysis on both my current playlist and the Indie Rock sample and I got the following charts:

# In[ ]:


def _distplot2(df, df_other, key, labels, x_limits):
    """
    Helper to plot the distribution of one column for two dataframes on the same chart.

    df and df_other are the dataframes to compare, key the column to plot
    (also used as chart title), labels the two legend entries (labels[0] for
    df, labels[1] for df_other) and x_limits either None or a (min, max) pair
    applied to the x axis.
    """
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases;
    # confirm the installed version still provides it.
    for frame, label in ((df, labels[0]), (df_other, labels[1])):
        ax = sns.distplot(frame[[key]], bins=30, label=label)
        if x_limits is not None:
            ax.set_xlim(*x_limits)
    plt.title(key)
    plt.legend()
    plt.show()


def _countplot2(df, df_other, key, labels):
    """
    Helper to plot the count charts of one column for two dataframes side by side.

    df is drawn on the left and df_other on the right; key is the column to
    count and labels holds the two chart titles (labels[0] for df, labels[1]
    for df_other).
    """
    fig, ax = plt.subplots(1, 2)
    for axis, frame, label in ((ax[0], df, labels[0]), (ax[1], df_other, labels[1])):
        sns.countplot(data=frame, x=key, ax=axis, palette="tab20")
        axis.set_title(label)
    plt.show()


# In[ ]:


comparison_labels = ["My Playlist", "5000 Indie rock songs"]

for key in ["duration_ms", "acousticness", "danceability", "energy", "instrumentalness",
            "liveness", "loudness", "valence", "speechiness", "tempo", "popularity",
            "artist_popularity"]:
    _distplot2(df_cur, df_sample, key, labels=comparison_labels,
               x_limits=x_limits.get(key, (0, 1)))

for key in ["key", "time_signature", "mode"]:
    _countplot2(df_cur, df_sample, key, labels=comparison_labels)


# In[ ]:


# Columns that all live on a 0-1 scale and can therefore share one boxplot axis
boxplot_columns = ["acousticness", "danceability", "energy", "instrumentalness",
                   "liveness", "valence", "speechiness", "artist_popularity_norm",
                   "popularity_norm", "tempo_norm"]

fig, ax = plt.subplots(2, 1)
sns.boxplot(data=df_cur[boxplot_columns], ax=ax[0])
ax[0].set_title(comparison_labels[0])
sns.boxplot(data=df_sample[boxplot_columns], ax=ax[1])
ax[1].set_title(comparison_labels[1])
plt.show()


# The graphs show that my playlist differs from the 5000 Indie Rock songs because:
# - I like shorter songs
# - I like songs with higher energy/loudness/tempo
# - I don't like songs with too negative a mood (I prefer valence > 0.3)
# - I like songs mostly in key (0, 1, 6, 9)
#
# The boxplot confirms the same insights and I agree with the outcome of this analysis.

# # Creating a new playlist with songs I potentially like
# Using these insights, I applied some filters to the 5000 songs dataframe in order to keep only tracks I potentially like; for each step I logged the dropped songs to double check the filter behavior.
#
# The first filter I applied was removing songs I already had in my original playlist, obviously.
#
# The other filters were:
# - acousticness < 0.1
# - energy > 0.75
# - loudness > -7dB
# - valence between 0.3 and 0.9
# - tempo > 120
# - key in (0, 1, 6, 9)
# - duration between the 10th and the 90th percentile of the original playlist duration (178s and 280s)

# In[ ]:


def _apply_condition(df, condition, label):
    """
    Helper to filter a dataframe and log the effect of the filter.

    df is the dataframe to filter (it must have a full_name column), condition
    a boolean mask over df (True => keep the row) and label the name of the
    filter used in the log output.
    It prints the row counts before/after and the names of the first 10
    dropped songs, and returns the filtered dataframe.
    """
    before = len(df)
    # head(10) so that the sample matches the "first 10" announced in the log line
    dropped_songs = df[~condition]["full_name"].head(10).tolist()
    df = df[condition]
    print("\ncondition [{}]: {}-{}={}".format(label, before, before - len(df), len(df)))
    print("first 10 dropped songs: {}".format(dropped_songs))
    return df


# In[ ]:


df_new = df_sample.drop_duplicates(["full_name"], keep="first")


# In[ ]:


# Every condition is computed on the current df_new, so the filters are cumulative
df_new = _apply_condition(
    df_new,
    condition=~(df_new["full_name"]).isin((df_cur["full_name"]).tolist()),
    label="name")
df_new = _apply_condition(df_new, condition=(df_new["acousticness"] < 0.1), label="acousticness")
df_new = _apply_condition(df_new, condition=(df_new["energy"] > 0.75), label="energy")
df_new = _apply_condition(df_new, condition=(df_new["loudness"] > -7), label="loudness")
df_new = _apply_condition(df_new, condition=(df_new["valence"].between(0.3, 0.9)), label="valence")
df_new = _apply_condition(df_new, condition=(df_new["tempo"] > 120), label="tempo")
df_new = _apply_condition(df_new, condition=(df_new["key"].isin([9, 0, 1, 6])), label="key")
# quantile([0.1, 0.9]) yields two values that are unpacked as lower/upper bound
df_new = _apply_condition(
    df_new,
    condition=(df_new["duration_ms"].between(*df_cur["duration_ms"].quantile([0.1, 0.9]))),
    label="duration_ms")
df_new.head()


# In the end I got a dataframe with 220 tracks and I created a new playlist using an API call

# In[ ]:


playlist_name_new = "Indie Rock Trial"
playlists = sp.user_playlists(username)
playlist_ids = [playlist['id'] for playlist in playlists['items']
                if playlist['name'] == playlist_name_new]

if not playlist_ids:
    playlists = sp.user_playlist_create(username, playlist_name_new)
    playlist_id = playlists["id"]
else:
    playlist_id = playlist_ids[0]

    # The playlist already exists: empty it before adding the new selection.
    # Follow the paging links so that tracks beyond the first page of results
    # are removed too (sp.next returns None after the last page).
    results = sp.user_playlist(username, playlist_id)
    track_ids = []
    tracks_page = results["tracks"]
    while tracks_page:
        track_ids += [t["track"]["id"] for t in tracks_page["items"]
                      if t.get("track") and t["track"].get("id")]
        tracks_page = sp.next(tracks_page)

    # One id is enough to remove all its occurrences; dict.fromkeys keeps the order
    track_ids = list(dict.fromkeys(track_ids))
    # Remove in batches of API_LIMIT ids, like the additions below
    while track_ids:
        results = sp.user_playlist_remove_all_occurrences_of_tracks(
            username, playlist_id, track_ids[:API_LIMIT])
        print(results)
        track_ids = track_ids[API_LIMIT:]

# Add the selected tracks, API_LIMIT ids per call
track_ids = df_new["id"].unique().tolist()
while track_ids:
    results = sp.user_playlist_add_tracks(username, playlist_id, track_ids[:API_LIMIT])
    print(results)
    track_ids = track_ids[API_LIMIT:]


# # Conclusions and future steps
# After a few days listening to the new playlist, I'm quite happy with the results and I'm already promoting some tracks to my original playlist.
#
# This "recommendation" method is really simple and probably works well only in this specific use case (i.e. it's a well definable subset of a specific genre).
# The standard recommendation method of Spotify is obviously much better because, apart from audio analysis, it also uses a mix of Collaborative Filtering models (analyzing your behavior and others’ behavior) and Natural Language Processing (NLP) models (analyzing text of the songs).
#
# Next steps:
# - Run the analysis again after a few months, in order to take into account new entries in my playlist and new songs in the 5000 sample
# - Enrich the information I already got with something new using the Spotify APIs (e.g. the Audio Analysis endpoint) or other services. It would be nice, as an example, to detect musical instruments in a track (guitars anyone??) or the presence of some features like distortion, riffs, etc.
# - Use the preview sample from the API to tag manually what I like/dislike on a subset of the 5000 songs and then run some ML algorithms in order to classify the music I like
# - Analyze my playlist more deeply using some ML algorithms (e.g. cluster my tracks)