#!/usr/bin/env python
# coding: utf-8

# Spotify's Worldwide Daily Song Ranking
#
# Dataset from: https://www.kaggle.com/edumucelli/spotifys-worldwide-daily-song-ranking
# Crawled from: https://spotifycharts.com/regional
# Crawler code is hosted on github: https://github.com/edumucelli/spotify-worldwide-ranking
#
# The original dataset contains only records of 2017. I modified the scripts found
# on github, to crawl the data of 2018, up to September.

import pandas as pd
import json
import matplotlib.pyplot as plt

# ## Load data

# Daily chart rows: Position, Track Name, Artist, Streams, URL, Date, Region.
df_ranking = pd.read_csv(
    './spotify_worldwide_daily_song_ranking_2017_2018.csv',
    parse_dates=['Date'])

# country-code -> continent-code mapping.
# Use a context manager so the file handle is closed (json.load(open(...)) leaked it).
with open('./continents.json') as f:
    df_continents = pd.Series(json.load(f), name='continent').to_frame()
df_continents.index.name = 'code'
df_continents.reset_index(level=0, inplace=True)

# country-code -> country metadata (name, continent, ...).
df_countries = pd.read_json('./countries.json', orient='index')
df_countries.index.name = 'code'
df_countries.reset_index(level=0, inplace=True)

# ## See some items in dataset
df_ranking.head()

# ### Statistical summary
df_ranking.describe(include='all')

# ### Rows contain missing data
df_ranking[df_ranking.isnull().any(axis=1)].head()
df_ranking.isnull().sum()

# ### Drop missing data
df_ranking.dropna(inplace=True)
df_ranking.describe(include='all')

# ### Simple plot — how many chart rows each region contributes
fig = plt.figure(figsize=(16, 4))
df_ranking['Region'].value_counts().plot(kind='bar')

print(df_ranking['Region'].value_counts().tail(30))

# See what's happening in the data of region 'lu' (few rows -> plot coverage by day)
fig = plt.figure(figsize=(16, 4))
df_ranking.loc[df_ranking['Region'] == 'lu']['Date'].value_counts().plot(kind='line', style='.-')

df_continents.head()
df_countries.head()

# ## A little bit preprocessing
# Join countries to continents; keep only the code/name columns of each side.
df_region = pd.merge(df_countries, df_continents, right_on='code', left_on='continent')
df_region = df_region[['code_x', 'name', 'code_y', 'continent_y']]
df_region.columns = ['country_code', 'country', 'continent_code', 'continent']

# Add a synthetic GLOBAL region so the worldwide chart rows join like any country.
# NOTE: DataFrame.append was removed in pandas 2.0 — build the row and concat instead.
global_row = pd.DataFrame([{
    'country_code': 'GLOBAL',
    'country': 'GLOBAL',
    'continent_code': 'GLOBAL',
    'continent': 'GLOBAL'
}])
df_region = pd.concat([df_region, global_row], ignore_index=True)
# Chart data uses lower-case region codes ('us', 'gb', 'global', ...).
df_region['country_code'] = df_region['country_code'].str.lower()

df_region.tail()
df_ranking.head()

# Attach country/continent metadata to every chart row.
df_ranking_region = pd.merge(df_ranking, df_region, left_on='Region', right_on='country_code')
df_ranking_region.head()

# ### Extract more columns from 'Date' column
# Build a small calendar table of the distinct dates, then derive period labels once
# per date instead of once per ranking row.
df_date = df_ranking_region[['Date']]
df_date = df_date.drop_duplicates()
df_date = df_date.reset_index(drop=True)
df_date = df_date.sort_values(['Date'])

df_date.head()


def year(d):
    """Return a year label like 'Y2017' for a date-like value."""
    return f'Y{d.year}'


print(year(df_date['Date'][0]))


def monthofyear(d):
    """Return a month label like '2017M01' (zero-padded month)."""
    return 'M'.join((str(d.year), str(d.month).zfill(2)))


print(monthofyear(df_date['Date'][0]))


def quarterofyear(d):
    """Return a quarter label like '2017Q1'."""
    return 'Q'.join((str(d.year), str(d.quarter)))


print(quarterofyear(df_date['Date'][0]))


def weekofyear(d):
    """Return an ISO week label like '2017W01'.

    Uses the ISO calendar year, so dates at year boundaries are labelled with
    the year their ISO week belongs to (e.g. 2017-01-01 -> '2016W52',
    2018-12-31 -> '2019W01').  The previous implementation used the removed
    ``Timestamp.week`` attribute and only special-cased January/week-52, which
    mislabelled December dates falling in week 1 and January dates in week 53.
    """
    iso_year, iso_week, _ = d.isocalendar()
    return 'W'.join((str(iso_year), str(iso_week).zfill(2)))


print(weekofyear(df_date['Date'][0]))
print(weekofyear(df_date['Date'][10]))

df_date.count()

# Materialise the period labels on the calendar table.
df_date['Year'] = df_date['Date'].apply(year)
df_date['Quarterofyear'] = df_date['Date'].apply(quarterofyear)
df_date['Monthofyear'] = df_date['Date'].apply(monthofyear)
df_date['Weekofyear'] = df_date['Date'].apply(weekofyear)

df_date.head()
df_date.tail()

# Attach the period labels to the ranking rows.
df_ranking = df_ranking_region.merge(df_date, on='Date', how='left')

# ### Unique songs — URL is the stable song identifier
df_songs = df_ranking[['URL', 'Track Name', 'Artist']].drop_duplicates(['URL'])
df_songs.count()

#
## Aggregate stream counts

# Daily stream totals per song and continent.
# Select 'Streams' before summing: a bare .sum() would also total the meaningless
# 'Position' column and, on pandas >= 2.0, raise TypeError on the string columns.
df_ranking_continent = df_ranking.groupby(['URL', 'continent', 'Date'])['Streams'].sum()
df_ranking_continent = df_ranking_continent.reset_index()
df_ranking_continent = df_ranking_continent.merge(df_date, on='Date', how='left')
df_ranking_continent.head()

# Re-attach the human-readable track name / artist.
df_ranking_continent_daily = df_ranking_continent.merge(df_songs, on='URL', how='left')
df_ranking_continent_daily.sort_values(['Streams'], ascending=[False]).head()

# Yearly totals per song and continent.
df_ranking_continent_yearly = df_ranking_continent_daily.groupby(['URL', 'continent', 'Year'])['Streams'].sum()
df_ranking_continent_yearly = df_ranking_continent_yearly.reset_index()
df_ranking_continent_yearly = df_ranking_continent_yearly.merge(df_songs, on='URL', how='left')
df_ranking_continent_yearly.head()

# Top songs of a year, worldwide.
df_ = df_ranking_continent_yearly[df_ranking_continent_yearly['continent'] == 'GLOBAL']
df_.sort_values(['Streams'], ascending=[False]).head()

# Weekly totals per song and continent.
df_ranking_continent_weekly = df_ranking_continent.groupby(['URL', 'continent', 'Weekofyear'])['Streams'].sum()
df_ranking_continent_weekly = df_ranking_continent_weekly.reset_index()
df_ranking_continent_weekly = df_ranking_continent_weekly.merge(df_songs, on='URL', how='left')
df_ranking_continent_weekly.head()

# Top weekly songs, worldwide.
df_ = df_ranking_continent_weekly[df_ranking_continent_weekly['continent'] == 'GLOBAL']
df_.sort_values(['Streams'], ascending=[False]).head(20)

df_songs[df_songs['Track Name'] == 'Something Just Like This']

# Weekly trajectory of one track across continents.
df_ = df_ranking_continent_weekly[df_ranking_continent_weekly['Track Name'] == "Something Just Like This"]
df_.sort_values(['Weekofyear'], ascending=[True]).head(20)

df_ranking_continent_weekly.to_csv('ranking_continent_weekly.csv', index=False)

df_sjlt_weekly = df_ranking_continent_weekly[df_ranking_continent_weekly['Track Name'] == "Something Just Like This"]
df_sjlt_weekly = df_sjlt_weekly.sort_values(['Weekofyear', 'Streams'], ascending=[True, False])
# Persist the weekly per-continent totals for "Something Just Like This".
df_sjlt_weekly.to_csv('something_just_like_this_weekly.csv', index=False)

# Same slice at daily granularity: filter by track, then order by week (ascending)
# and streams (descending) before exporting.
sjlt_mask = df_ranking_continent_daily['Track Name'] == "Something Just Like This"
df_sjlt_daily = (
    df_ranking_continent_daily
    .loc[sjlt_mask]
    .sort_values(['Weekofyear', 'Streams'], ascending=[True, False])
)

df_sjlt_daily.to_csv('something_just_like_this_daily.csv', index=False)