#!/usr/bin/env python
# coding: utf-8

# This notebook contains some code snippets I used in writing the following blog post: https://dev.to/ericbonfadini/finding-my-new-favorite-song-on-spotify-4lgc

# While I'm developing (or just while I'm commuting to work) I usually love to hear some rock music.
# 
# I created some playlists on Spotify, but lately I've been sticking to the same playlist, containing my favorite "Indie Rock" songs.
# This playlist is made up of more or less 45 songs I discovered through the years in several ways.
# 
# Since I was starting to get bored of always listening to the same songs, last weekend I decided to analyze my playlist using the Spotify APIs in order to discover insights and hopefully find some new tunes I could add.
# 
# Here's what I did in more or less 300 lines of Python 3 code (boilerplate included).

# # Setting up the environment

# For my analysis I set up a Python 3 virtual environment with the following libraries:
# - Pandas for data analysis
# - Seaborn for data visualization
# - Spotipy for interaction with Spotify APIs
# 
# In order to access the Spotify APIs I registered my app and then I provided the spotipy library with the client_id, the client_secret and a redirect url.

# In[ ]:


import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import spotipy
from matplotlib import style
from spotipy import util

# Widen the Pandas console output (line width and number of displayed columns)
for option_name, option_value in (("display.width", 600), ("display.max_columns", 50)):
    pd.set_option(option_name, option_value)

# Plot styling: Seaborn "talk" context combined with the Matplotlib "ggplot" style
sns.set_context("talk")
style.use("ggplot")

# Number of items requested/sent per Spotify API call (used as batch size below)
API_LIMIT = 50


# # Analyzing my playlist tracks

# After a first API call to get my playlist id, I got all the tracks of my playlist with another API call along with some basic information like: song id, song name, artist id, artist name, album name, song popularity.
# 
# With another API call I got some extra information about the artists in my playlist, like genres and artist_popularity.
# 
# Finally with another API call I got some insightful information about my tracks:
# - duration_ms: the duration of the track in milliseconds;
# - acousticness: describes the acousticness of a song (1 => high confidence the track is acoustic). It ranges from 0 to 1;
# - danceability: describes the danceability of a song (1 => high confidence the track is danceable). It ranges from 0 to 1;
# - energy: it's a perceptual measure of intensity and activity (e.g. death metal has high energy while classical music has low energy). It ranges from 0 to 1;
# - instrumentalness: predicts whether a track contains no vocals (1 => high confidence the track has no vocals). It ranges from 0 to 1;
# - liveness: detects the presence of an audience in the recording (1 => high confidence the track is live). It ranges from 0 to 1;
# - loudness: detects the overall loudness of a track in decibels. It ranges from -60dB to 0dB;
# - valence: describes the musical positiveness conveyed by a track (1 => more positive, 0 => more negative). It ranges from 0 to 1;
# - speechiness: detects the presence of spoken words in a track (1 => speech, 0 => non speech, just music). It ranges from 0 to 1;
# - key: describes the pitch class notation of the song. It ranges from 0 to 11;
# - mode: the modality of a track (0 => minor, 1 => major);
# - tempo: the overall estimated tempo of a track in beats per minute (BPM);
# - time_signature: An estimated overall time signature of a track (how many beats are in each bar or measure).
# 
# The results of all these calls have been put inside Pandas dataframes in order to simplify the data analysis and then merged in a single dataframe using artist IDs and track IDs.
# Some values (like song/artist popularity and tempo) have been normalized.

# In[ ]:


# Account and playlist under analysis
username, playlist_name = "eric.bonfadini", "Indie Rock"

# Request an access token for the scope needed to edit public playlists
scope = "playlist-modify-public"
spotify_token = util.prompt_for_user_token(username, scope)

# Authenticated client used by every API call below
sp = spotipy.Spotify(auth=spotify_token)


# In[ ]:


# Get playlist ID
# The playlists endpoint is paginated: walk through every page instead of
# looking only at the first one, otherwise a playlist that is not on the
# first page would be reported as missing.
playlists_results = sp.user_playlists(username)
playlist_ids = []
while playlists_results:
    playlist_ids += [playlist['id'] for playlist in playlists_results['items']
                     if playlist['name'] == playlist_name]
    # sp.next returns the following page, or None when there are no more pages
    playlists_results = sp.next(playlists_results)

if not playlist_ids:
    raise Exception("Cannot find playlist named: {}".format(playlist_name))
else:
    print(playlist_ids)


# In[ ]:


# Get tracks
tracks_results = sp.user_playlist(username, playlist_ids[0])

# One row per playlist item; only the first listed artist of each track is kept
track_rows = []
for playlist_item in tracks_results['tracks']['items']:
    track = playlist_item["track"]
    first_artist = track["artists"][0]
    track_rows.append([track["id"], track["name"], first_artist["id"],
                       first_artist["name"], track["album"]["name"], track["popularity"]])

df_tracks = pd.DataFrame(
    track_rows,
    columns=["id", "song_name", "artist_id", "artist_name", "album_name", "popularity"],
)
# Normalize popularity
df_tracks["popularity_norm"] = df_tracks["popularity"] / 100.

df_tracks.head()


# In[ ]:


def _get_artists_df(sp, artist_ids):
    """
    This is a helper method to get artists' information from artist ids,
    sending the ids in batches of at most API_LIMIT per API call.
    It returns a Pandas dataframe with the columns artist_id, artist_genres,
    artist_popularity and artist_popularity_norm (popularity divided by 100).
    """

    rows = []
    remaining_ids = artist_ids
    call_number = 0

    while remaining_ids:
        call_number += 1
        # Split off the next batch and keep the rest for the following calls
        batch, remaining_ids = remaining_ids[:API_LIMIT], remaining_ids[API_LIMIT:]

        print("Call #{} for artists".format(call_number))
        response = sp.artists(batch)

        for artist in response["artists"]:
            rows.append([artist["id"], artist["genres"], artist["popularity"]])

    df_artists = pd.DataFrame(rows, columns=["artist_id", "artist_genres", "artist_popularity"])

    df_artists["artist_popularity_norm"] = df_artists["artist_popularity"] / 100.

    return df_artists


# In[ ]:


# Fetch the artist details once per distinct artist of the playlist
artist_ids = df_tracks["artist_id"].drop_duplicates().tolist()
df_artists = _get_artists_df(sp, artist_ids)
df_artists.head()


# In[ ]:


def _get_features_df(sp, track_ids):
    """
    This is a helper method to get tracks' audio features from track ids,
    sending the ids in batches of at most API_LIMIT per API call.

    None entries in an API response are skipped, so the result can contain
    fewer rows than track_ids. When no features are collected at all, the
    returned dataframe is empty but still has every expected column.

    It returns a Pandas dataframe with the selected audio feature columns
    plus tempo_norm.
    """

    feature_columns = ["id", "analysis_url", "duration_ms", "acousticness", "danceability",
                       "energy", "instrumentalness", "liveness", "loudness", "valence",
                       "speechiness", "key", "mode", "tempo", "time_signature"]

    feature_list = []
    i = 0
    while track_ids:
        print("Call #{} for audio features".format(i + 1))
        features_results = sp.audio_features(track_ids[:API_LIMIT])

        # Drop empty placeholders: a None entry cannot be turned into a dataframe row
        feature_list += [features for features in (features_results or []) if features is not None]

        track_ids = track_ids[API_LIMIT:]
        i += 1

    # Selecting the columns at construction time builds a fresh dataframe (no
    # assignment on a slice) and keeps the columns even when feature_list is empty
    df_features = pd.DataFrame(feature_list, columns=feature_columns)

    # tempo is in range 24-200 ==> 0-176, normalize it
    df_features["tempo_norm"] = (df_features["tempo"] - 24) / 176.

    return df_features


# In[ ]:


# Fetch the audio features once per distinct track of the playlist
track_ids = df_tracks["id"].drop_duplicates().tolist()
df_features = _get_features_df(sp, track_ids)
df_features.head()


# In[ ]:


# Build the dataframe of the current playlist: audio features joined with the
# track details (on track id) and then with the artist details (on artist id)
df_cur = df_features.merge(df_tracks, on="id").merge(df_artists, on="artist_id")

# Readable "artist -- song" name of each row
df_cur["full_name"] = df_cur["artist_name"] + " -- " + df_cur["song_name"]

# Most popular songs first
df_cur = df_cur.sort_values("popularity", ascending=False)

df_cur.head()


# # Exploratory Data Analysis

# After ensuring that the artists in my playlist all contain "Indie Rock" as genre, I checked my playlist using shape, info and describe of the full dataframe

# In[ ]:


# info() prints its report by itself and returns None: wrapping it in print()
# would only add a stray "None" line to the output
df_cur.info()

df_cur.describe()


# In[ ]:


# Convert time_signature and key to category, declaring the full set of
# categories used for each column
category_values = {"time_signature": [1, 2, 3, 4, 5], "key": list(range(12))}
for column, categories in category_values.items():
    df_cur[column] = df_cur[column].astype(pd.api.types.CategoricalDtype(categories=categories))


# In[ ]:


def _distplot(df, key, label, x_limits):
    """
    This is a helper method to plot the distribution chart of one column.
    The chart is titled with the column name; x_limits, when not None, is
    unpacked into the x axis limits.
    """
    # NOTE(review): distplot is deprecated in recent seaborn releases -- confirm
    # that the installed version still provides it
    axes = sns.distplot(df[[key]], label=label, bins=30)
    if x_limits is not None:
        axes.set_xlim(*x_limits)
    axes.set_title(key)
    axes.legend()
    plt.show()


# In[ ]:


# X axis ranges for the features that are not on a 0-1 scale;
# None means that no range is forced on the chart
x_limits = {
    "duration_ms": None,
    "loudness": (-60, 0),
    "tempo": (24, 200),
    "popularity": (0, 100),
    "artist_popularity": (0, 100),
}

distribution_features = ["duration_ms", "acousticness", "danceability", "energy", "instrumentalness",
                         "liveness", "loudness", "valence", "speechiness", "tempo", "popularity",
                         "artist_popularity"]

for key in distribution_features:
    # Features without an explicit entry fall back to the 0-1 range
    _distplot(df_cur, key, label="My Playlist", x_limits=x_limits.get(key, (0, 1)))


# In[ ]:


def _countplot(df, key, label):
    """
    This is a helper method to plot the count chart of one column,
    using label as the chart title.
    """
    axes = sns.countplot(x=key, data=df, palette="tab20")
    axes.set_title(label)
    plt.show()


# In[ ]:


# Count charts for the discrete features
for key in ("key", "time_signature", "mode"):
    _countplot(df_cur, key, label="My Playlist")


# In[ ]:


# Single boxplot of the features that share the 0-1 scale
boxplot_columns = ["acousticness", "danceability", "energy", "instrumentalness", "liveness",
                   "valence", "speechiness", "artist_popularity_norm", "popularity_norm", "tempo_norm"]
ax = sns.boxplot(data=df_cur[boxplot_columns])
ax.set_title("My Playlist")
plt.show()


# All these graphs show that I like songs with low acousticness/instrumentalness/speechiness, high energy/loudness/tempo, high artist popularity and duration of more or less 200 seconds.
# Valence and song popularity span a wide range, meaning that my playlist contains both well-known and unknown songs, and both positive and negative ones.
# 
# But how does my playlist compare with the Indie Rock genre?

# # Comparing my playlist with a sample of the genre

# I used some calls to the search API with 'genre:"Indie Rock"' as a keyword and 'type=track' to get a sample of the Indie Rock genre (5000 songs in total).
# This API offers also some nice keywords like 'tag:hipster' (to get only albums with the lowest 10% popularity) or 'year:1980-2020' (to get only tracks released in a specific year range).

# In[ ]:


number_of_tracks = 5000
genre = "indie rock"

# Number of search calls needed to collect the requested amount of tracks
search_runs = number_of_tracks // API_LIMIT
search_query = 'genre:"{}"'.format(genre)

search_list = []
for run in range(search_runs):
    print("Call #{} for tracks".format(run + 1))
    search_results = sp.search(search_query, type="track", limit=API_LIMIT, offset=run * API_LIMIT)

    # One row per returned track; only the first listed artist is kept
    for found_track in search_results['tracks']['items']:
        first_artist = found_track["artists"][0]
        search_list.append([found_track["id"], found_track["name"], first_artist["id"],
                            first_artist["name"], found_track["album"]["name"],
                            found_track["popularity"]])

df_search = pd.DataFrame(
    search_list,
    columns=["id", "song_name", "artist_id", "artist_name", "album_name", "popularity"],
)
df_search["popularity_norm"] = df_search["popularity"] / 100.
df_search.head()


# In[ ]:


# Audio features of the sampled tracks (df_features is rebound to the sample here)
track_ids = df_search["id"].drop_duplicates().tolist()
df_features = _get_features_df(sp, track_ids)
df_features.head()


# In[ ]:


# Artist details of the sampled tracks (df_artists is rebound to the sample here)
artist_ids = df_search["artist_id"].drop_duplicates().tolist()
df_artists = _get_artists_df(sp, artist_ids)
df_artists.head()


# In[ ]:


# Same join used for the playlist: features + track details + artist details
df_sample = df_features.merge(df_search, on="id").merge(df_artists, on="artist_id")

df_sample["full_name"] = df_sample["artist_name"] + " -- " + df_sample["song_name"]
df_sample = df_sample.sort_values("popularity", ascending=False)

df_sample.head()


# In[ ]:


# Convert time_signature and key to category, declaring the full set of
# categories used for each column
sample_category_values = {"time_signature": [1, 2, 3, 4, 5], "key": list(range(12))}
for column, categories in sample_category_values.items():
    df_sample[column] = df_sample[column].astype(pd.api.types.CategoricalDtype(categories=categories))


# In[ ]:


# info() prints its report by itself and returns None: wrapping it in print()
# would only add a stray "None" line to the output
df_sample.info()

df_sample.describe()


# Then I repeated the same analysis on both my current playlist and the Indie Rock sample and I got the following charts:

# In[ ]:


def _distplot2(df, df_other, key, labels):
    pass

def _countplot2(df, df_other, key, labels):
    """
    This is a helper method to plot the count charts of the same column of
    two dataframes side by side: df on the left titled labels[0], df_other
    on the right titled labels[1].
    """
    fig, (left_axes, right_axes) = plt.subplots(1, 2)
    panels = ((df, left_axes, labels[0]), (df_other, right_axes, labels[1]))
    for frame, axes, title in panels:
        sns.countplot(data=frame, x=key, ax=axes, palette="tab20")
        axes.set_title(title)
    plt.show()


# In[ ]:


comparison_labels = ["My Playlist", "5000 Indie rock songs"]

# Overlaid distributions of the continuous features
continuous_features = ["duration_ms", "acousticness", "danceability", "energy", "instrumentalness",
                       "liveness", "loudness", "valence", "speechiness", "tempo", "popularity",
                       "artist_popularity"]
for key in continuous_features:
    _distplot2(df_cur, df_sample, key,
               labels=["My Playlist", "5000 Indie rock songs"],
               x_limits=x_limits.get(key, (0, 1)))

# Side by side counts of the discrete features
for key in ("key", "time_signature", "mode"):
    _countplot2(df_cur, df_sample, key, labels=["My Playlist", "5000 Indie rock songs"])


# In[ ]:


# Features sharing the 0-1 scale, drawn for both dataframes
scaled_columns = ["acousticness", "danceability", "energy", "instrumentalness", "liveness",
                  "valence", "speechiness", "artist_popularity_norm", "popularity_norm",
                  "tempo_norm"]

# Two stacked boxplots: my playlist on top, the genre sample below
fig, ax = plt.subplots(2, 1)
for row, (frame, title) in enumerate([(df_cur, "My Playlist"), (df_sample, "5000 Indie rock songs")]):
    sns.boxplot(data=frame[scaled_columns], ax=ax[row])
    ax[row].set_title(title)
plt.show()


# The graphs show that my playlist differs from the 5000 Indie Rock songs because:
# - I like shorter songs
# - I like songs with higher energy/loudness/tempo
# - I don't like songs with too negative mood (valence > 0.3)
# - I like songs mostly in key (0, 1, 6, 9)
# 
# The boxplot confirms the same insights and I agree with the outcome of this analysis.

# # Creating a new playlist with songs I potentially like

# Using these insights, I applied some filters to the 5000 songs dataframe in order to keep only tracks I potentially like; for each step I logged the dropped songs to double check the filter behavior.
# 
# The first filter I applied was removing songs I already had in my original playlist, obviously.
# 
# The other filters were:
# - acousticness < 0.1
# - energy > 0.75
# - loudness > -7dB
# - valence between 0.3 and 0.9
# - tempo > 120
# - key in (0, 1, 6, 9)
# - duration between the 10% and 90% quantiles of the original playlist duration (178s and 280s)

# In[ ]:


def _apply_condition(df, condition, label):
    """
    This is a helper method to keep only the rows of df matching a condition,
    logging how many rows have been dropped and the names of the first ones.

    df is a dataframe with a "full_name" column, condition is a boolean mask
    for df (True = keep the row) and label is the filter name used in the log.
    It returns the filtered dataframe.
    """
    kept = df[condition]
    # The log line announces the first 10 dropped songs, so ask for 10
    # explicitly: head() with no argument would only return 5 rows
    dropped_songs = df[~condition]["full_name"].head(10).tolist()

    before, after = len(df), len(kept)
    print("\ncondition [{}]: {}-{}={}".format(label, before, before - after, after))
    print("first 10 dropped songs: {}".format(dropped_songs))
    return kept


# In[ ]:


# Keep a single row for each "artist -- song" name of the sample
df_new = df_sample.drop_duplicates(subset=["full_name"], keep="first")


# In[ ]:


# Duration bounds: 10% and 90% quantiles of my playlist, computed once
duration_low, duration_high = df_cur["duration_ms"].quantile([0.1, 0.9])

# Filters applied in order; each condition is built on the dataframe left by
# the previous filter, so that the boolean mask always matches its rows
filters = [
    # Songs already in my playlist
    ("name", lambda df: ~df["full_name"].isin(df_cur["full_name"].tolist())),
    ("acousticness", lambda df: df["acousticness"] < 0.1),
    ("energy", lambda df: df["energy"] > 0.75),
    ("loudness", lambda df: df["loudness"] > -7),
    ("valence", lambda df: df["valence"].between(0.3, 0.9)),
    ("tempo", lambda df: df["tempo"] > 120),
    ("key", lambda df: df["key"].isin([9, 0, 1, 6])),
    # Label fixed: it was logged as "duraton_ms"
    ("duration_ms", lambda df: df["duration_ms"].between(duration_low, duration_high)),
]

for filter_label, build_condition in filters:
    df_new = _apply_condition(df_new, condition=build_condition(df_new), label=filter_label)

df_new.head()


# In the end I got a dataframe with 220 tracks and I created a new playlist using an API call

# In[ ]:


playlist_name_new = "Indie Rock Trial"

# Look for an existing playlist with this name. The playlists endpoint is
# paginated, so every page is inspected and not only the first one.
playlists = sp.user_playlists(username)
playlist_ids = []
while playlists:
    playlist_ids += [playlist['id'] for playlist in playlists['items']
                     if playlist['name'] == playlist_name_new]
    # sp.next returns the following page, or None when there are no more pages
    playlists = sp.next(playlists)

if not playlist_ids:
    playlists = sp.user_playlist_create(username, playlist_name_new)
    playlist_id = playlists["id"]

else:
    playlist_id = playlist_ids[0]

    # Empty the existing playlist. A single call returns only the first page
    # of tracks, so fetch and remove repeatedly until nothing is left;
    # otherwise a re-run would leave old tracks in place.
    previous_track_ids = None
    while True:
        results = sp.user_playlist(username, playlist_id)
        track_ids = [t["track"]["id"] for t in results["tracks"]["items"]
                     if t.get("track") and t["track"].get("id")]

        # Stop when the playlist is empty, or when the last removal changed
        # nothing (avoids looping forever on tracks that cannot be removed)
        if not track_ids or track_ids == previous_track_ids:
            break
        previous_track_ids = track_ids

        for start in range(0, len(track_ids), API_LIMIT):
            results = sp.user_playlist_remove_all_occurrences_of_tracks(
                username, playlist_id, track_ids[start:start + API_LIMIT])
            print(results)

# Add the selected tracks in batches of at most API_LIMIT
track_ids = df_new["id"].unique().tolist()

for start in range(0, len(track_ids), API_LIMIT):
    results = sp.user_playlist_add_tracks(username, playlist_id, track_ids[start:start + API_LIMIT])
    print(results)


# # Conclusions and future steps

# After a few days listening to the new playlist, I'm quite happy with the results and I'm already promoting some tracks to my original playlist.
# 
# This "recommendation" method is really simple and probably works well only in this specific use case (i.e. it's a well definable subset of a specific genre).
# The standard recommendation method of Spotify is obviously much better because, apart from audio analysis, it uses also a mix of Collaborative Filtering models (analyzing your behavior and others’ behavior) and Natural Language Processing (NLP) models (analyzing text of the songs).
# 
# Next steps:
# - Run the analysis again after a few months, in order to take into account new entries in my playlist and new songs in the 5000 sample
# - Enrich the information I already got with something new using the Spotify APIs (e.g. the Audio Analysis endpoint) or other services. It would be nice, as an example, to detect musical instruments in a track (guitars anyone??) or the presence of some features like distortion, riffs, etc.
# - Use the preview sample from the API to tag manually what I like/dislike on a subset of the 5000 songs and then run some ML algorithms in order to classify the music I like
# - Analyze deeper my playlist using some ML algorithms (e.g. cluster my tracks)