#!/usr/bin/env python
# coding: utf-8

# # Analyze and visualize last.fm listening history
#
# To see this analysis live, check out my article ["Analyzing Last.fm Listening History"](http://geoffboeing.com/2016/05/analyzing-lastfm-history/)
#
# The csv data files were created with [lastfm_downloader.ipynb](lastfm_downloader.ipynb)

# In[106]:

import re
import string
from datetime import datetime as dt

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz

# The inline-backend magic only exists inside IPython/Jupyter; skip it when
# this exported notebook is executed as a plain Python script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

plt.style.use('ggplot')
plt.rcParams['font.family'] = 'Myriad Pro'
plt.rcParams['text.color'] = '#555555'
print(plt.style.available)


# In[107]:

# define the fonts to use for plots
family = 'Myriad Pro'
title_font = fm.FontProperties(family=family, style='normal', size=20, weight='normal', stretch='normal')
label_font = fm.FontProperties(family=family, style='normal', size=16, weight='normal', stretch='normal')
ticks_font = fm.FontProperties(family=family, style='normal', size=12, weight='normal', stretch='normal')
ticks_font_h = fm.FontProperties(family=family, style='normal', size=10.5, weight='normal', stretch='normal')


# In[108]:

def get_colors(cmap, n, start=0., stop=1., alpha=1., reverse=False):
    '''Return an n-length list of RGBA tuples sampled from a named colormap.

    cmap: colormap name understood by matplotlib
    n: number of colors to sample
    start, stop: ends of the sampled colormap interval; n evenly spaced
        points between them (inclusive) are used
    alpha: alpha value placed in every returned tuple, replacing the
        colormap's own alpha
    reverse: if true, return the list in reverse order
    '''
    # look the colormap up once instead of once per sample; pyplot.get_cmap
    # is used because matplotlib.cm.get_cmap is deprecated in newer releases
    colormap = plt.get_cmap(cmap)
    colors = [tuple(colormap(x)[:3]) + (alpha,) for x in np.linspace(start, stop, n)]
    return list(reversed(colors)) if reverse else colors


# ## Top artists

# In[166]:

artists_most = pd.read_csv('data/lastfm_top_artists.csv', encoding='utf-8')
artists_most = artists_most.set_index('artist')['play_count'].head(25)
artists_most.head()


# In[183]:

ax = artists_most.sort_values(ascending=True).plot(kind='barh', figsize=[6, 10], width=0.85, alpha=0.6,
                                                   color='#003399', edgecolor='w', grid=False)
ax.xaxis.grid(True)
ax.set_title('Artists I have played the most')
ax.set_ylabel('')
ax.set_xlabel('Number of plays')
ax.set_position([0.50, 0.05, 0.9-0.5, 0.95-0.05])
plt.savefig('images/lastfm-artists-played-most-h.png', dpi=96)


# ## Top tracks

# In[168]:

tracks_most = pd.read_csv('data/lastfm_top_tracks.csv', encoding='utf-8')
# label each row "artist<newline>track" so both show up on the bar chart axis
index = tracks_most.apply(lambda x: u'{}\n{}'.format(x['artist'], x['track']), axis='columns')
tracks_most = tracks_most.set_index(index).drop(labels=['artist', 'track'], axis='columns')
tracks_most = tracks_most['play_count'].head(20)
tracks_most.head()


# In[180]:

ax = tracks_most.sort_values().plot(kind='barh', figsize=[6, 10], width=0.85, alpha=0.6,
                                    color='#003399', edgecolor='w', grid=False)
ax.xaxis.grid(True)
ax.set_xlabel('Number of plays')
ax.set_ylabel('')
ax.set_title('Songs I have played the most', y=1.005)
ax.set_position([0.50, 0.05, 0.9-0.5, 0.95-0.05])
plt.savefig('images/lastfm-tracks-played-most-h.png', dpi=96)
plt.show()


# ## Top albums

# In[176]:

albums_most = pd.read_csv('data/lastfm_top_albums.csv', encoding='utf-8')
# label each row "artist<newline>album" so both show up on the bar chart axis
index = albums_most.apply(lambda x: u'{}\n{}'.format(x['artist'], x['album']), axis='columns')
albums_most = albums_most.set_index(index).drop(labels=['artist', 'album'], axis='columns')
albums_most = albums_most['play_count'].head(30)
albums_most.head()


# In[179]:

ax = albums_most.sort_values().plot(kind='barh', figsize=[6, 15], width=0.85, alpha=0.6,
                                    color='#990066', edgecolor='w', grid=False)
ax.xaxis.grid(True)
ax.set_xlabel('Number of plays')
ax.set_ylabel('')
ax.set_title('Albums I have played the most', y=1.005)
ax.set_position([0.50, 0.05, 0.9-0.5, 0.95-0.05])
plt.savefig('images/lastfm-albums-played-most-h.png', dpi=96)
plt.show()


# ## All-time scrobbles

# In[115]:

# read the all-time scrobbles data set
scrobbles = pd.read_csv('data/lastfm_scrobbles.csv', encoding='utf-8')
# the csv's own 'timestamp' column is discarded; a datetime version is
# rebuilt from the 'datetime' column below
scrobbles = scrobbles.drop('timestamp', axis=1)
print('{:,} total scrobbles'.format(len(scrobbles)))
print('{:,} total artists'.format(len(scrobbles['artist'].unique())))

# According to the last.fm web site, I have ~13,970 scrobbles. This API endpoint seems to miss the ones from before 2012.

# In[116]:

# convert to datetime
scrobbles['timestamp'] = pd.to_datetime(scrobbles['datetime'], dayfirst=True)


# In[117]:

# functions to convert UTC to the local time zone and extract date/time elements
# NOTE(review): the notes in this notebook talk about Pacific time, but the
# zone configured here is 'GMT' - confirm which one is intended.
LOCAL_TZ = pytz.timezone('GMT')


def convert_tz(x):
    '''Treat the pandas Timestamp x as UTC and return it as a datetime in LOCAL_TZ.'''
    # to_pydatetime() is the supported replacement for the deprecated Timestamp.to_datetime()
    return x.to_pydatetime().replace(tzinfo=pytz.utc).astimezone(LOCAL_TZ)


def get_year(x):
    '''Return the year of x after time zone conversion.'''
    return convert_tz(x).year


def get_month(x):
    '''Return "YYYY-MM" for x after time zone conversion.'''
    local = convert_tz(x)  # convert once, not once per field
    return '{}-{:02}'.format(local.year, local.month)


def get_day(x):
    '''Return the day of the month of x after time zone conversion.'''
    return convert_tz(x).day


def get_hour(x):
    '''Return the hour of the day of x after time zone conversion.'''
    return convert_tz(x).hour


def get_day_of_week(x):
    '''Return the weekday number of x (Monday is 0) after time zone conversion.'''
    return convert_tz(x).weekday()


# These per-row helper functions are inefficient, but necessary to work around this bug: https://github.com/pydata/pandas/issues/11757. They can be replaced with vectorized calls when the bug is fixed in a future pandas version.
# In[118]:

# parse out date and time elements in the time zone applied by convert_tz
scrobbles['year'] = scrobbles['timestamp'].map(get_year)
scrobbles['month'] = scrobbles['timestamp'].map(get_month)
scrobbles['day'] = scrobbles['timestamp'].map(get_day)
scrobbles['hour'] = scrobbles['timestamp'].map(get_hour)
scrobbles['dow'] = scrobbles['timestamp'].map(get_day_of_week)

# drop rows with 01-01-1970 as timestamp
scrobbles = scrobbles[scrobbles['year'] > 1970]
scrobbles.head()

# All the last.fm timestamps of my scrobbles appear to be UTC, but the year, month, day, hour columns are now converted to Pacific Time (where I've done nearly all my listening)
# NOTE(review): convert_tz currently targets 'GMT', not a Pacific zone - confirm which is intended.

# # Exploration depth by artist
#
# Number of unique tracks heard by each artist

# In[119]:

# the inner groupby yields one row per (artist, track); counting those rows
# per artist gives the number of distinct tracks played for that artist
unique_tracks_per_artist = scrobbles.groupby(['artist', 'track']).size().groupby(level=0).size()
ax = unique_tracks_per_artist.sort_values(ascending=False).head(40).plot(
    kind='bar', figsize=[11, 7], width=0.85, alpha=0.6, color='#003399', edgecolor='w', grid=False)
ax.yaxis.grid(True)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, rotation_mode='anchor', ha='right')
plt.gcf().set_size_inches(11, 7)
plt.title('Number of unique tracks played by artist')
plt.ylabel('Number of unique tracks heard')
plt.xlabel('')
plt.savefig('images/lastfm-uniqe-tracks-played-per-artist-most-h.png', dpi=96, bbox_inches='tight')


# ## Timeseries analysis

# In[120]:

# number of plays per calendar day; computed once and reused below
daily_plays = scrobbles.set_index('timestamp').resample('d').size()
daily_plays.plot(alpha=.5)
# 20-day rolling mean, shifted back 10 days so the window is centered
daily_plays.rolling(20).mean().shift(-10).plot()
plt.gcf().set_size_inches(10,6)
plt.gca().set_ylabel('Number of plays')
plt.gca().set_xlabel('')
plt.gca().set_title('Number of songs played timeseries')
plt.savefig('images/lastfm-songs-played-timeseries.png', dpi=96, bbox_inches='tight')


# In[121]:

def first_plays_per_day(plays, keys):
    '''Count, per calendar day, the groups whose earliest scrobble fell on that day.

    plays: scrobbles DataFrame with a datetime 'timestamp' column
    keys: column name or list of column names identifying a group
        (e.g. 'artist' or ['artist', 'track'])
    returns: Series of counts indexed by day
    '''
    # take the minimum timestamp rather than the first row of each group, so
    # the result does not depend on the row order of the csv
    first_seen = plays.groupby(keys)['timestamp'].min()
    return first_seen.reset_index().set_index('timestamp').resample('d').size()


# Unique artist timeseries
ax1 = plt.subplot(2,1,1)
new_artists_per_day = first_plays_per_day(scrobbles, 'artist')
new_artists_per_day.plot(alpha=.5)
new_artists_per_day.rolling(20).mean().shift(-10).plot()
plt.gca().set_ylabel('Number of new artist plays')
plt.gca().set_xlabel('')
plt.gca().set_title('New artist exploration timeseries')

ax2 = plt.subplot(2,1,2, sharex=ax1)
# Unique song exploration timeseries
new_tracks_per_day = first_plays_per_day(scrobbles, ['artist', 'track'])
new_tracks_per_day.plot(alpha=.5)
new_tracks_per_day.rolling(20).mean().shift(-10).plot()
plt.gca().set_ylabel('Number of new song plays')
plt.gca().set_xlabel('')
plt.gca().set_title('New song exploration timeseries')
plt.gcf().set_size_inches(10,6)
plt.tight_layout(rect=[0, 0, .85, 1])
plt.savefig('images/lastfm-new-artists-and-tracks-timeseries.png', dpi=96, bbox_inches='tight')


# In[122]:

# Calendar view
import calmap
calmap.calendarplot(daily_plays, cmap='Blues', yearlabel_kws={'color' : '#DDDDDD'}, vmax=50)
plt.gcf().set_size_inches(10,12)
plt.tight_layout()
plt.savefig('images/lastfm-activity-calendar-view.png', dpi=96, bbox_inches='tight')


# ## Visualize top artists over time
#
# How have the cumulative plays of most listened-to artists changed over time?
# In[123]:

# compare the n most-played artists
n = 6
top_n_artists = artists_most.head(n)

# cumulative number of scrobbles per day for each of those artists
top_artist_timeseries_over_observed_timeperiod = (
    scrobbles[scrobbles.artist.isin(top_n_artists.index)]
    .groupby(['timestamp', 'artist'])
    .size()
    .unstack()
    .resample('d')
    .sum()
    .fillna(0)
    .cumsum()
)

# shift every curve so that its last value equals the play count listed in
# artists_most for that artist
starting_offset = top_n_artists - top_artist_timeseries_over_observed_timeperiod.iloc[-1, :]
(top_artist_timeseries_over_observed_timeperiod + starting_offset).loc[:, top_n_artists.index].plot(alpha=1.0)

plt.gca().set_xlabel('Year')
plt.gca().set_ylabel('Cumulative number of plays')
plt.gca().set_title('Cumulative number of plays per artist over time')
plt.gcf().set_size_inches(10,6)
plt.gca().legend(loc='upper right', bbox_to_anchor=(1.30,1.017))
plt.savefig('images/lastfm-scrobbles-top-artists-years.png', dpi=96, bbox_inches='tight')
plt.show()


# ## Day of the week analysis
#
# How many songs have been played on each day of the week?

# In[124]:

# get the play count sum by day of the week
# reindex to all seven weekday numbers first, so a weekday without any plays
# shows up as 0 instead of breaking the label assignment below
dow_counts = scrobbles['dow'].value_counts().reindex(list(range(7)), fill_value=0)
dow_counts.index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


# In[125]:

ax = dow_counts.plot(kind='bar', figsize=[7, 5], width=0.65, alpha=0.6, color='#003399', edgecolor='w', grid=False)
ax.yaxis.grid(True)
ax.set_xticklabels(dow_counts.index, rotation=35, rotation_mode='anchor', ha='right')
ax.set_title('Songs played per day of the week')
ax.set_xlabel('')
ax.set_ylabel('Number of plays')
plt.savefig('images/lastfm-scrobbles-per-weekday.png', dpi=96, bbox_inches='tight')
plt.show()


# ## Hour of the day analysis
#
# How many songs have been played during each hour of the day?
# In[126]:

hour_counts = scrobbles['hour'].value_counts().sort_index()
ax = hour_counts.plot(kind='bar', figsize=[10, 5], alpha=0.6, color='#003399', grid=True)

# bars sit at positions 0..len-1, so one tick per bar
xlabels = hour_counts.index.map(lambda x: '{:02}:00'.format(x))
ax.set_xticks(range(len(xlabels)))
ax.set_xticklabels(xlabels, rotation=45, rotation_mode='anchor', ha='right')

ax.yaxis.grid(True)
ax.set_ylabel('Number of plays')
ax.set_xlabel('')
ax.set_title('Number of songs played per hour of the day')
plt.savefig('images/lastfm-scrobbles-per-hour.png', dpi=96, bbox_inches='tight')
plt.show()


# ## Analysis by day of the week + hour of the day
#
# How many songs have been played during each hour of the day on each day of the week?

# In[127]:

# get the play counts by hour of day and day of week
weekday_hour_counts = scrobbles.groupby(['dow','hour']).count()['track']
hour_numbers = weekday_hour_counts.index.levels[1]
day_numbers = weekday_hour_counts.index.levels[0]
day_names = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}


# In[128]:

# get one color per day of week and draw one line per day with it
colors = get_colors('Dark2', n=len(day_numbers), start=0.05, stop=0.95, reverse=True)
ax = weekday_hour_counts.unstack(level=0).rename(columns=day_names).plot(color=colors)

# the line plot's x axis carries the hour values themselves, so put the ticks
# at those values rather than at 0..len-1 (the two differ if an hour is absent)
xlabels = hour_numbers.map(lambda x: '{:02}:00'.format(x))
ax.set_xticks(list(hour_numbers))
ax.set_xticklabels(xlabels, rotation=45, rotation_mode='anchor', ha='right')

ax.yaxis.grid(True)
ax.set_ylabel('Number of plays')
ax.set_xlabel('')
ax.set_title('Number of songs played, by day of week and hour of day')
ax.legend(loc='upper right', bbox_to_anchor=(1.23,1.017))
plt.gcf().set_size_inches(10,6)
plt.savefig('images/lastfm-scrobbles-days-hours.png', dpi=96, bbox_inches='tight')
plt.show()


# ## Analysis of a single year (or multiple years)
#
# Add additional years to the 'isin' list to query multiple years.
# In[24]:

def rank_artists(plays):
    '''Return a table of artists in plays ordered by number of scrobbles.

    plays: scrobbles DataFrame with an 'artist' column
    returns: DataFrame with columns 'artist' and 'play count', indexed by
        rank starting at 1
    '''
    counts = plays['artist'].value_counts()
    # build the columns explicitly; the names produced by
    # reset_index() on a value_counts() result differ between pandas versions
    table = pd.DataFrame({'artist': counts.index, 'play count': counts.values},
                         columns=['artist', 'play count'])
    table.index = [rank + 1 for rank in table.index]
    return table


def rank_by_play_count(plays, item):
    '''Return a table of (artist, item) pairs in plays ordered by number of scrobbles.

    plays: scrobbles DataFrame with 'artist', 'timestamp' and item columns
    item: name of the second grouping column, e.g. 'track' or 'album'
    returns: DataFrame with columns 'artist', item and 'play count', indexed
        by rank starting at 1
    '''
    keys = ['artist', item]
    # the play count is the number of non-null timestamps in each group
    table = plays.groupby(keys).count().sort_values('timestamp', ascending=False)
    table = table.reset_index().rename(columns={'timestamp': 'play count'})[keys + ['play count']]
    table.index = [rank + 1 for rank in table.index]
    return table


scrobbles_year = scrobbles[scrobbles['year'].isin([2015])]
len(scrobbles_year)


# In[25]:

# what artists did i play the most that year?
artists_year = rank_artists(scrobbles_year)
artists_year.head(10)


# In[26]:

# what tracks did i play the most that year?
tracks_year = rank_by_play_count(scrobbles_year, 'track')
tracks_year.head(10)


# In[27]:

# what albums did i play the most that year?
albums_year = rank_by_play_count(scrobbles_year, 'album')

# remove text in parentheses or brackets
regex = re.compile(r'\(.*\)|\[.*]')
albums_year['album'] = albums_year['album'].map(lambda x: regex.sub(u'', x))
albums_year.head(10)


# ## Analysis of a single month (or multiple months)
#
# Add additional months to the 'isin' list to query multiple months.

# In[28]:

scrobbles_month = scrobbles[scrobbles['month'].isin(['2014-02'])]
len(scrobbles_month)


# In[29]:

# what artists did i play the most that month?
artists_month = rank_artists(scrobbles_month)
artists_month.head(10)


# In[30]:

# what tracks did i play the most that month?
tracks_month = rank_by_play_count(scrobbles_month, 'track')
tracks_month.head(10)


# In[31]:

# what albums did i play the most that month?
albums_month = rank_by_play_count(scrobbles_month, 'album')
albums_month.head(10)


# ## Listening history of a specific artist, album, or track
#
# See the first 5 matching rows for someone or something (these are the 5 most recent plays only if the scrobbles are ordered newest-first - TODO confirm the csv ordering)

# In[170]:

# which plays have an artist name containing 'Tim Minchin'?
# na=False treats rows without an artist as non-matching instead of failing
scrobbles[scrobbles['artist'].str.contains('Tim Minchin', na=False)].head()


# In[171]:

# which plays have an album name containing 'Americana'?
scrobbles[scrobbles['album'].fillna('').str.contains('Americana')].head()


# In[172]:

# which plays have a track name containing 'No Brakes'?
# na=False treats rows without a track name as non-matching instead of failing
scrobbles[scrobbles['track'].str.contains('No Brakes', na=False)].head()


# ## Characteristics of artists' names
#
# Text analysis of artist name strings

# In[175]:

# remove 'The ' preceding artist names, get unique set of names, then get first letter frequency
# only a leading 'The ' is stripped; the same text inside a name
# (e.g. 'Cage The Elephant') is left alone
artist_names = scrobbles['artist']
has_leading_article = artist_names.str.startswith('The ', na=False)
artists_clean = artist_names.where(~has_leading_article, artist_names.str[len('The '):])

# skip missing and blank names, which have no first letter
unique_names = pd.Series(artists_clean.dropna().unique())
unique_names = unique_names[unique_names.str.strip().str.len() > 0]

first_letters = unique_names.map(lambda x: x.upper()[0]).value_counts()
# keep A-Z only, in alphabetical order; letters no artist starts with count as 0
first_letters = first_letters.reindex(list(string.ascii_uppercase), fill_value=0)


# In[176]:

# plot the frequency of artist names that begin with each letter
ax = first_letters.plot(kind='bar', figsize=[10, 6], width=0.85, alpha=0.6, color='#339933', edgecolor='w')
ax.yaxis.grid(True)
ax.set_ylim((0, 40))
ax.set_xticklabels(first_letters.index, rotation=0, rotation_mode='anchor', ha='center', fontproperties=ticks_font)
ax.set_title('Number of artist names that begin with each letter', fontproperties=title_font)
ax.set_xlabel('First letter in name', fontproperties=label_font)
ax.set_ylabel('Number of unique artists', fontproperties=label_font)
plt.savefig('images/lastfm-artists-first-letter-count.png', dpi=96, bbox_inches='tight')
plt.show()


# In[177]:

# which artist names begin with the letter 'O'?
pd.Series(artists_clean[artists_clean.str.upper().str.startswith('O', na=False)].unique())


# In[178]:

# what are the most common first words in artist names that begin with 'M'?
artists_m = pd.Series(artists_clean[artists_clean.str.upper().str.startswith('M', na=False)].unique())
artists_m.map(lambda x: x.split()[0]).value_counts().head(15)


# In[179]:

# what are the most common first words in all the artist names?
unique_names.map(lambda x: x.split()[0].lower()).value_counts().head(15)


# In[180]:

# what are the most common words in all the artist names, anywhere in the name?
# words that are too generic to be interesting (a set: membership test only)
stop_list = {'&', 'the', 'and', 'of', 'a', 'in', 'for', 'la', 'de'}

# rows without an artist name are skipped; they have no words or length
unique_artists = artists_clean.dropna().unique()

# every lower-cased word of every unique artist name, minus the stop words
word_list = [word.lower() for artist in unique_artists for word in artist.split()]
word_list = [word for word in word_list if word not in stop_list]
pd.Series(word_list).value_counts().head(15)


# In[181]:

# what is the longest artist name?
print(max(unique_artists, key=len))


# In[182]:

# what is the distribution of lengths of artist names (number of characters)?
name_lengths = pd.Series([len(artist) for artist in unique_artists])
# one entry per length 0..50, in order; lengths that never occur count as 0
# and names longer than 50 characters are left out of the chart
name_lengths = name_lengths.value_counts().reindex(list(range(51)), fill_value=0)


# In[183]:

ax = name_lengths.plot(kind='bar', figsize=(10,6), alpha=0.6, width=1, color='#990066', edgecolor='#990066')

# label only every tenth bar to keep the axis readable
xlabels = [x if x % 10 == 0 else '' for x in name_lengths.index]
ax.set_xticklabels(xlabels, rotation=0, rotation_mode='anchor', ha='center', fontproperties=ticks_font)
ax.yaxis.grid(True)

ax.set_title('Frequency of artist name length', fontproperties=title_font)
ax.set_xlabel('Number of characters in artist name', fontproperties=label_font)
ax.set_ylabel('Number of artists', fontproperties=label_font)
plt.savefig('images/lastfm-artists-name-length.png', dpi=96, bbox_inches='tight')
plt.show()


# In[ ]: