%matplotlib inline import IPython.display as ipd import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import MultiLabelBinarizer import utils sns.set_context("notebook", font_scale=1.5) plt.rcParams['figure.figsize'] = (17, 5) tracks = utils.load('data/fma_metadata/tracks.csv') genres = utils.load('data/fma_metadata/genres.csv') features = utils.load('data/fma_metadata/features.csv') echonest = utils.load('data/fma_metadata/echonest.csv') np.testing.assert_array_equal(features.index, tracks.index) assert echonest.index.isin(tracks.index).all() tracks.shape, genres.shape, features.shape, echonest.shape print('{} tracks, {} artists, {} albums, {} genres'.format( len(tracks), len(tracks['artist', 'id'].unique()), len(tracks['album', 'id'].unique()), sum(genres['#tracks'] > 0))) mean_duration = tracks['track', 'duration'].mean() print('track duration: {:.0f} days total, {:.0f} seconds average'.format( sum(tracks['track', 'duration']) / 3600 / 24, mean_duration)) dimensionality = mean_duration * 44000 * 2 print('sample dimensionality: {:.1e}'.format(dimensionality)) print('total size, i.e. 
number of audio samples: {:.1e}'.format(dimensionality * len(tracks))) for subset in tracks['set', 'subset'].unique(): indicator = tracks['set', 'subset'] <= subset print('{:6} {:6} tracks {:.1f} days'.format( subset, sum(indicator), sum(indicator) * 30 / 3600 / 24)) print('{} deleted tracks (largest track_id is {})'.format(tracks.index.max() - len(tracks), tracks.index.max())) print('First track: {}'.format(tracks['track', 'date_created'].min())) d = pd.DataFrame(tracks.index, index=tracks['track', 'date_created'].values) d['indicator'] = 1 fig, ax1 = plt.subplots() ax2 = ax1.twinx() d['track_id'].plot(ax=ax1) d['indicator'].cumsum().plot(ax=ax1) ax1.set_ylabel('#tracks') ax1.set_ylim(0, 160000) (d['indicator'] * -100).plot(ax=ax2, style='r') # needed for no apparent reason color = sns.color_palette('deep', 3)[2] d['indicator'].resample('2M').sum().fillna(0).plot(ax=ax2, style='--', color=color) ax2.set_ylabel('#tracks added') ax2.set_ylim(500, 4500) ax2.set_ylim(0, 4000) ax2.grid(False) lns = ax1.get_lines() + [ax2.get_lines()[1]] ax1.legend(lns, ['largest track id', '#tracks still present', '#tracks added per 2 months'], loc='lower right') plt.savefig('figures/growth.pdf') SPLITS = ['training', 'validation', 'test'] SUBSETS = ['small', 'medium', 'large'] print('subset #train #val #test val_ratio test_ratio') for subset in SUBSETS: counts = [sum((tracks['set', 'split'] == split) & (tracks['set', 'subset'] <= subset)) for split in SPLITS] ratios = np.array(counts[0] / counts[1:]) print('{:8s} {:7d} {:7d} {:7d} {:8.2f} {:9.2f}'.format(subset, *counts, *ratios)) for subset in ['small', 'medium']: subset = tracks['set', 'subset'] <= subset d = genres.reset_index().set_index('title') d = d.loc[tracks.loc[subset, ('track', 'genre_top')].unique()] for split in SPLITS: b = tracks['set', 'split'] == split d['#' + split] = tracks.loc[subset & b, ('track', 'genre_top')].value_counts() d['val_ratio'] = d['#training'] / d['#validation'] d['test_ratio'] = d['#training'] / 
d['#test'] ipd.display(d.sort_values('#training', ascending=False)) d = pd.DataFrame(index=genres.index, columns=SPLITS) for genre in genres.index: b = tracks['track', 'genres_all'].map(lambda genres: genre in genres) d.loc[genre] = tracks.loc[b, ('set', 'split')].value_counts() d['val_ratio'] = d['training'] / d['validation'] d['test_ratio'] = d['training'] / d['test'] d.sort_values('training', ascending=False, inplace=True) ipd.display(d.head(10)) ipd.display(d.tail(10)) def isnull(column, df=tracks): if column[1] in ['tags', 'genres', 'genres_all']: return df[column].apply(lambda x: len(x) == 0) elif df.dtypes[column] == np.int: return df[column] <= 0 else: return df[column].isnull() def count(series): col0 = series.name[0] df = tracks if col0 == 'track' else tracks.drop_duplicates((col0, 'id')) n = (~isnull(series.name, df)).sum() p = n / len(df) * 100 return n, p # Columns / metadata usage across dataset. d = pd.DataFrame(index=tracks.columns.drop('set'), columns=['n', 'p']) d = d.apply(count, axis=1) d['n'] = d['n'].astype(np.int) d # Excerpt as example in the paper. 
columns = [
    ('track', 'title'),
    ('track', 'genres_all'),
    ('track', 'genre_top'),
    ('track', 'duration'),
    ('track', 'listens'),
    ('album', 'title'),
    ('album', 'listens'),
    ('album', 'tags'),
    ('artist', 'name'),
    ('artist', 'location'),
]

# Keep only the tracks for which none of the selected columns is missing.
first_column, *remaining_columns = columns
non_null = ~isnull(first_column)
for column in remaining_columns:
    non_null &= ~isnull(column)

# Fixed seed, so the same 8 track ids are drawn on every run.
rng = np.random.RandomState(42)
tids = rng.permutation(tracks.index[non_null])[:8]

tracks.loc[tids, columns].head()
#tracks.loc[tids, columns].to_latex('figures/tracks.tex', formatters={
#    ('artist', 'longitude'): '{:,.1f}'.format,
#    ('artist', 'latitude'): '{:,.1f}'.format,
#})

tracks['track', 'license'].value_counts().head(10)

tracks['track', 'language_code'].value_counts().head(10)

# Histogram of track durations, restricted to tracks shorter than 800 seconds.
durations = tracks['track', 'duration']
plt.figure(figsize=(10, 4))  # Poster: (7, 3)
short_durations = durations[durations.values < 800]
p = sns.distplot(
    short_durations,
    kde=False,
    rug=False,
    color='k',
    hist_kws={'alpha': 0.4},
)
p.set_xlabel('duration [seconds]')
p.set_ylabel('#tracks')
p.set_xlim(0, 800)  # Poster: 500
plt.tight_layout()
plt.savefig('figures/duration_distribution.pdf')

durations.describe()

# Uncommon bit rates are VBR encodings.
bit_rates = tracks['track', 'bit_rate']
most_common_rates = bit_rates.value_counts().head(5).index.tolist()
print('Common bit rates: {}'.format(most_common_rates))
print('Average bit rate: {:.0f} kbit/s'.format(bit_rates.mean()/1000))
p = sns.distplot(bit_rates, kde=False, rug=False)
p.set_xlabel('bit rate')
p.set_ylabel('#tracks');

# Tags.
# Number of tags per track, per distinct album and per distinct artist.
d1 = tracks['track', 'tags'].apply(len)
d2 = tracks.drop_duplicates(('album', 'id'))['album', 'tags'].apply(len)
# The `- 1` presumably discounts the tag that repeats the artist name
# (see the check at the end of this section).
d3 = tracks.drop_duplicates(('artist', 'id'))['artist', 'tags'].apply(len) - 1

labels = ['track', 'album', 'artist']
for name, tag_counts in zip(labels, (d1, d2, d3)):
    # Clamp at 0: the artist counts can reach -1 after the subtraction above.
    fewest = max(tag_counts.min(), 0)
    print('{}: from {} to {} tags'.format(name, fewest, tag_counts.max()))

MAX = 13  # Poster: 11
fig, ax1 = plt.subplots(figsize=(10, 4))  # Poster: (7, 3)
ax2 = ax1.twinx()

# Three narrow histograms, shifted by a quarter bin each so they sit side by
# side. Tracks use the left axis; albums and artists share the right one.
histograms = [
    (ax1, d1, 0.25, 'C0', labels[0]),
    (ax2, d2, 0.50, 'C1', labels[1]),
    (ax2, d3, 0.75, 'C2', labels[2]),
]
for axis, tag_counts, offset, bar_color, name in histograms:
    axis.hist(tag_counts, bins=np.arange(MAX)+offset, rwidth=0.2,
              color=bar_color, label=name)

ax1.set_xlabel('#tags')
ax1.set_ylabel('#tracks')
ax2.set_ylabel('#artists / #albums')

ax1.set_xlim(0.5, MAX-0.5)
ax1.set_xticks(range(1, MAX))
ax1.set_ylim(0, 5000)
ax2.set_ylim(0, 500)

ax1.legend(loc='upper center')
ax2.legend(loc='upper right')
ax2.grid(False)

fig.tight_layout()
fig.savefig('figures/tag_distribution.pdf')

# One artist tag is often the artist name.
col = 'artist'
d = tracks.drop_duplicates((col, 'id'))
has_tags = d[col, 'tags'].apply(len) > 0
d.loc[has_tags, [('artist', 'name'), (col, 'tags')]].head()

# Listens, favorites, comments.
def plot(col0, col1, maxval, subplot=None):
    """Draw a histogram of ``tracks[col0][col1]`` for values below ``maxval``.

    Parameters
    ----------
    col0 : str
        Top-level column group. 'track' uses every row; 'artist' and 'album'
        use one row per distinct 'id' of that group.
    col1 : str
        Column inside the group whose values are plotted.
    maxval : number
        Exclusive upper bound on the plotted values; also the right x-limit.
    subplot : optional
        When truthy, passed to ``plt.subplot`` before drawing.
    """
    if col0 == 'track':
        table = tracks['track']
    elif col0 in ['artist', 'album']:
        table = tracks[col0].drop_duplicates('id')
    if subplot:
        plt.subplot(subplot)
    values = table[col1]
    below_max = values[values.values < maxval]
    axis = sns.distplot(below_max, kde=False, color='k', hist_kws={'alpha': 0.4})
    axis.set_xlim(-1, maxval)
    axis.set_xlabel('#' + col1)
    axis.set_ylabel('#' + col0 + 's')


plt.figure(figsize=(17, 10))
for panel in [('track', 'listens', 10e3, 221),
              ('track', 'interest', 10e3, 222),
              ('track', 'favorites', 100, 223),
              ('track', 'comments', 20, 224)]:
    plot(*panel)

plt.figure(figsize=(17, 10))
for panel in [('album', 'listens', 100e3, 221),
              ('album', 'favorites', 100, 223),
              ('album', 'comments', 20, 224)]:
    plot(*panel)

plt.figure(figsize=(17, 5))
for panel in [('artist', 'favorites', 100, 121),
              ('artist', 'comments', 20, 122)]:
    plot(*panel)

# Same as above, formatted for the paper.
plt.figure(figsize=(10, 4))  # Poster: (7, 3)
plot('album', 'listens', 40e3)  # Poster 20e3
plt.tight_layout()
plt.savefig('figures/listens_distribution.pdf')

tracks['album', 'listens'].max()

# Most listened albums.
albums = tracks['album'].groupby('id').first()
albums.sort_values('listens', ascending=False).head(10)


def plot(col0, col1):
    """Plot how many entries of ``tracks[col0]`` fall in each year of ``col1``.

    This definition replaces the histogram helper of the same name above.

    Parameters
    ----------
    col0 : str
        Top-level column group. 'track' uses every row; 'artist' and 'album'
        use one row per distinct 'id' of that group.
    col1 : str
        Date column used as the index that is resampled with the 'A' rule.
    """
    if col0 == 'track':
        table = tracks['track']
    elif col0 in ['artist', 'album']:
        table = tracks[col0].drop_duplicates('id')
    ones = pd.Series(1, index=table[col1])
    per_year = ones.resample('A').sum().fillna(0)
    per_year.plot()


plt.figure()
plot('track', 'date_recorded')
plot('album', 'date_released')

plt.figure()
plot('artist', 'active_year_begin')
plot('artist', 'active_year_end')

plt.figure()
for group in ('track', 'album', 'artist'):
    plot(group, 'date_created')

# Same as above, formatted for the paper.
# Albums released per year, restricted to 1990-2017 for the figure.
plt.figure(figsize=(5, 4))
d = tracks['album'].drop_duplicates('id')
d = pd.Series(1, index=d['date_released'])
d = d.resample('A').sum().fillna(0)
b = d.index >= pd.to_datetime(1990, format='%Y')
b &= d.index <= pd.to_datetime(2017, format='%Y')
d[b].plot(color='k')
plt.xlabel('release year')
plt.ylabel('#albums')
plt.tight_layout()
plt.savefig('figures/album_release_year.pdf')

d.index.min().year, d.index.max().year

# Number of tracks per artist and per album (counts below 50 only).
for effect in ['artist', 'album']:
    d = tracks[effect, 'id'].value_counts()
    ipd.display(d.head(5))
    p = sns.distplot(d[(d.values < 50) & (d.values >= 0)], kde=False)
    p.set_xlabel('#tracks per ' + effect);
    p.set_ylabel('#' + effect + 's');

# Number of distinct artists per top-level genre (genres whose parent is 0).
# Built from a dict so the Series is numeric: an empty `pd.Series(index=...)`
# without a dtype is object-typed on current pandas and cannot be bar-plotted.
top_level_titles = genres.loc[genres['parent'] == 0, 'title'].values
counts = pd.Series(
    {
        genre: len(tracks.loc[tracks['track', 'genre_top'] == genre,
                              ('artist', 'id')].unique())
        for genre in top_level_titles
    },
    name='#artists',
)
counts.sort_values(ascending=False).plot.bar()
plt.ylabel('#artists');

# The top-level genres used by tracks must match those listed in `genres`.
# `dropna()` goes first so this works whatever array type `unique()` returns.
a = set(tracks['track', 'genre_top'].dropna().unique())
b = set(genres.loc[genres['top_level'].unique(), 'title'].values)
assert a == b
print('{} top-level genres'.format(len(a)))
genres[genres['parent'] == 0].sort_values('#tracks', ascending=False)

# Genres per track.
from matplotlib.ticker import FuncFormatter

# Number of entries in each track's 'genres' and 'genres_all' lists.
labels = ['genres', 'genres_all']  #, 'genres_top']
d = [tracks['track', label].map(len) for label in labels]
labels = ['{}\nmax: {}'.format(label, d1.max()) for label, d1 in zip(labels, d)]

for label, d1 in zip(labels, d):
    print('{} per track: from {} to {} tags'.format(label, d1.min(), d1.max()))
print('#tracks without genre: {}'.format((tracks['track', 'genres'].map(len) == 0).sum()))

MAX = 9
fig, ax = plt.subplots(figsize=(5, 4))
ax.hist(d, bins=np.arange(MAX)-0.5, label=labels)
ax.set_xlabel('#genres per track')
ax.set_ylabel('#tracks')
ax.set_xlim(-0.5, MAX-1.5)
ax.set_xticks(range(MAX-1))
# Label ticks in thousands from their actual positions. A fixed label list
# ('0', '10k', ..., '50k') is only right if ticks land exactly on those values.
ax.yaxis.set_major_formatter(
    FuncFormatter(lambda value, pos: '0' if value == 0 else '{:.0f}k'.format(value / 1000)))
ax.legend(loc='upper right')
fig.tight_layout()
fig.savefig('figures/genres_per_track.pdf')

# Number of tracks per genre (full).
d = genres[genres['#tracks'] > 2000].sort_values('#tracks', ascending=False)  # Poster: 5000
plt.figure(figsize=(10, 4))  # Poster: (7, 4)
# x and y are passed by keyword: recent seaborn releases reject positional
# x/y arguments, while older ones accept both forms.
p = sns.barplot(x='title', y='#tracks', data=d, color='k', alpha=0.4)
p.set_xlabel('')
p.set_ylabel('#tracks')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('figures/genre_distribution.pdf')

genres.loc[genres['#tracks'] > 0, '#tracks'].min(), genres['#tracks'].max()

# Number of tracks per top-level genre (medium).
d = tracks[tracks['set', 'subset'] <= 'medium']
d = d['track', 'genre_top'].value_counts()
plt.figure(figsize=(10, 4))  # Poster: (7, 4)
d.plot.bar(color='k', alpha=0.4)
plt.ylabel('#tracks')
plt.xlabel('')
plt.tight_layout()
plt.savefig('figures/genre_top_distribution.pdf')

d

# Genre hierarchy, drawn with the project's `utils.Genres` helper.
g = utils.Genres(genres)
graph = g.create_tree([25, 31], 1)
ipd.Image(graph.create_png())

graph = g.create_tree(14)
graph.write_pdf('figures/genre_hierarchy.pdf');

roots = g.find_roots()
print('{} roots'.format(len(roots)))
graph = g.create_tree(roots)
# NOTE(review): same path as the write above, so that file is overwritten
# by the full tree. Confirm this is intended.
graph.write_pdf('figures/genre_hierarchy.pdf');

# Genre co-occurrence: entry (i, j) counts the tracks whose 'genres' list
# contains both genre i and genre j. The diagonal is zeroed.
enc = MultiLabelBinarizer()
genres_indicator = enc.fit_transform(tracks['track', 'genres'])
genres_names = genres.loc[enc.classes_, 'title'].values
cross_correlation = genres_indicator.T @ genres_indicator
np.fill_diagonal(cross_correlation, 0)

plt.figure(figsize=(28, 28))
# Zero counts map to -inf on the log scale; silence the divide-by-zero warning.
with np.errstate(divide='ignore'):
    log_cross_correlation = np.log(cross_correlation)
plt.imshow(log_cross_correlation)
plt.yticks(range(len(genres_names)), genres_names);
plt.xticks(range(len(genres_names)), genres_names, rotation=90);

# The N most frequent genre pairs. Only the strict lower triangle is kept so
# each unordered pair appears once.
cross_correlation = np.tril(cross_correlation, k=-1)
sort = np.argsort(cross_correlation.flatten())
N = 20
# `sort[:-N:-1]` selected only N - 1 entries; reverse first, then take N.
indices = np.unravel_index(sort[::-1][:N], cross_correlation.shape)
for i, j in zip(*indices):
    print('{}: {} | {}'.format(cross_correlation[i, j], genres_names[i], genres_names[j]))

features.head(5).style.format('{:.2f}')

sns.pairplot(features.loc[:, ('mfcc', 'mean', slice('01','03'))]);
sns.pairplot(features.loc[:, ('mfcc', 'std', slice('01','03'))]);

print('Echonest features available for {} tracks.'.format(len(echonest)))