#!/usr/bin/env python
# coding: utf-8

# # Spotify Data Exploration: the Popularity Feature
#
# ## Intro:
#
# After retrieving some data from the Spotify API (for more info about that check out [this notebook](https://github.com/tgel0/spotify-data/blob/master/notebooks/SpotifyDataRetrieval.ipynb)) it's time to get some insights. In this notebook, I will use data collected during the months of August and September 2018 to identify the most popular tracks and artists on Spotify using the 'popularity' feature.
#
# ## About the Popularity Feature:
#
# From the [official Spotify docs](https://developer.spotify.com/documentation/web-api/reference/search/search/):
# >"The popularity of the track. The value will be between 0, for least popular, and 100 for most popular.
# The popularity of a track is a value between 0 and 100, with 100 being the most popular. Popularity is based mainly on the total number of playbacks. Duplicate tracks, such as both in a single and in an album, are popularity rated differently.
# Note: This value is not updated in real-time and may therefore lag behind in actual popularity."
#
# ## Goal of this Notebook:
#
# The goal is to use the previously retrieved data to gain insights from the popularity feature, such as the most popular tracks and the most popular artists, by analyzing and visualizing the data using the Python libraries Pandas, Numpy and Matplotlib.
# In[1]:


# import libraries
import glob
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# Directory holding the snapshot CSVs. Each file name is expected to end in a
# 4-character date tag (e.g. "...0920.csv"); that tag becomes the suffix of
# the popularity column contributed by the file.
path = 'Datasets/Summer2018'

# glob returns files in filesystem-dependent order; sorting makes the merge
# order (and therefore row/column order and which duplicate survives the
# de-duplication below) reproducible across machines.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # fail here with a clear message instead of a KeyError on a missing
    # 'popularityXXXX' column further down
    raise FileNotFoundError("No CSV files found in '{}'".format(path))

# create lists of columns to be used when reading/merging the csv's
columns = ['artist_name', 'track_id', 'track_name', 'popularity']
merge_columns = ['artist_name', 'track_id', 'track_name']

# start from an empty dataframe with the defined column structure
df = pd.DataFrame(columns=columns)

# Outer-join every snapshot onto df so no track is lost.
# df already has a 'popularity' column, so each snapshot's own 'popularity'
# column collides with it and receives the right-hand suffix, producing
# 'popularity0920', 'popularity0830', ... while the left one keeps its name.
for csv_path in all_files:
    snapshot = pd.read_csv(csv_path, usecols=columns)
    # date tag = last 4 characters of the file name without its extension
    date_tag = os.path.splitext(os.path.basename(csv_path))[0][-4:]
    df = df.merge(snapshot, how='outer', on=merge_columns, suffixes=('', date_tag))

print('Shape: ', df.shape)
df.head()


# Since I have merged 3 files based on artist and track names there shouldn't be a lot of duplicates.
#
# However, it is still worth doing a quick drop_duplicates here.

# In[2]:


# drop duplicate tracks (the first occurrence of each artist/track pair is kept)
df.drop_duplicates(subset=['artist_name', 'track_name'], inplace=True)
print('Shape after dropping: ', df.shape)


# # 1. Top 50 most Popular Tracks

# In[3]:


# Sum and mean of the individual snapshot scores. pandas skips NaN by default,
# so a track missing from a snapshot gets a lower sum, while its mean is taken
# over the snapshots it does appear in.
df['popularity'] = df[['popularity0920', 'popularity0830', 'popularity0807']].sum(axis=1)
df['popularity_mean'] = df[['popularity0920', 'popularity0830', 'popularity0807']].mean(axis=1)

# create new dataframe df_top consisting of the 100 most popular tracks
df_top = df.sort_values('popularity', ascending=False).head(100)

# show the first 50 results
df_top[['artist_name', 'track_name', 'popularity', 'popularity_mean']].head(50)


# # 2.
# Top Artists by Popularity
#
# Note: the Spotify API offers a special popularity score on artist-level as well. That score is not used here.
#
# Instead, I have used only the popularity scores of their individual tracks.

# In[4]:


# show top 20 artists by number of tracks in top 100
df_top[['artist_name', 'track_name']].groupby('artist_name').count().sort_values('track_name', ascending=False).head(20)


# In[5]:


# show top 20 artists by total popularity of their tracks in top 100
df_top[['artist_name', 'popularity']].groupby('artist_name').sum().sort_values('popularity', ascending=False).head(20)


# # 3. Visualizing Popularity
#
# For this visualization I borrowed the code from another project of mine - [Twitter 10k (plot number 5)](https://github.com/tgel0/twitter-10k/blob/master/Twitter10k.ipynb).

# In[6]:


# snapshot columns in chronological order, with the tick label for each
snapshot_columns = ['popularity0807', 'popularity0830', 'popularity0920']
snapshot_labels = ['7th August', '30th August', '20th September']

# create a new transposed dataframe where the track names are the columns and individual popularities the rows
df_top10_pop = df_top[['track_name'] + snapshot_columns].set_index('track_name').head(10).T

# set the figure size
plt.figure(figsize=(12, 18))

# create a color palette
palette = plt.get_cmap('Set1')

# one stacked subplot per track; derive the grid from the data instead of
# assuming exactly 10 tracks
n_tracks = len(df_top10_pop.columns)

# multiple line plot of the top10 track popularities
for num, track in enumerate(df_top10_pop.columns, start=1):

    # 'Set1' has fewer colours than there are tracks; wrap the index so
    # out-of-range values do not all clamp to the same last colour
    track_color = palette(num % palette.N)

    # find the right spot on the plot
    plt.subplot(n_tracks, 1, num)

    # plot the individual popularities
    df_top10_pop.loc[snapshot_columns, track].plot(marker='', color=track_color, linewidth=2.5)

    # same limits for every subplot
    plt.ylim(90, 100)

    # The row labels are strings, so pandas draws the points at x = 0, 1, 2.
    # Put one tick on each point explicitly rather than relabelling whatever
    # ticks the auto-locator happened to generate.
    plt.xticks(list(range(len(snapshot_columns))), snapshot_labels)

    # only the bottom subplot shows its tick labels
    if num < n_tracks:
        plt.tick_params(labelbottom=False)

    # add title
    plt.title(track, loc='left', fontsize=10, fontweight=0, color=track_color)

# add general title
plt.suptitle("Popularity of Top 10 Tracks during Summer 2018", fontsize=13, fontweight=0, color='black', style='italic');