"""Exploratory analysis of Billboard Top 40 songs (notebook export).

Sections, in execution order:

1. Area chart of Top 40 entries per genre over time.
2. Scatter matrix of song attributes and a popularity time series of #1 songs.
3. Linear regression of popularity on audio features / elapsed days.
4. Logistic regression on above-median and four-level popularity classes.
5. Random forests predicting a rock song's decade and a song's genre.
6. K-means clustering of four genres plus an embeddable track list per cluster.

The script reads three CSV files from hard-coded local paths (see the
``*_CSV`` constants) and is written to run on both Python 2 and Python 3.
"""
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML, display
from sklearn import feature_selection, linear_model
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

try:  # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:  # older releases, as the script was originally written
    from sklearn.cross_validation import train_test_split

try:  # pandas >= 0.20
    from pandas.plotting import scatter_matrix
except ImportError:  # older releases
    from pandas.tools.plotting import scatter_matrix

plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')

# Raw strings keep the Windows separators literal; a plain 'C:\Users...'
# literal is a syntax error on Python 3 because of the \U escape.
SONG_GENRE_CSV = r'C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project/song_genre.csv'
SONG_DATA_CSV = r'C:\Users\Matt\SkyDrive\Documents\GA Data Science\Final Project\song_data.csv'
TOP_RANK_CSV = r'C:\Users\Matt\Desktop\Billboard_Top_40\Data/top_rank.csv'

# Feature set shared by the classification sections.
AUDIO_FEATURES = ['danceability', 'duration', 'energy', 'key', 'liveness', 'loudness',
                  'tempo', 'time_signature', 'log_instrument', 'log_speech']

# NOTE(review): the original format string was an empty literal ('' % (i, ids)),
# which always raises TypeError, so the real template was evidently lost.
# Reconstructed as a Spotify "trackset" embed taking (title, comma-joined ids);
# confirm the markup and that the 'uri' column holds bare track ids.
SPOTIFY_EMBED = ('<iframe src="https://embed.spotify.com/?uri=spotify:trackset:'
                 'Cluster %s:%s" width="300" height="380" frameborder="0" '
                 'allowtransparency="true"></iframe>')


def _say(*parts):
    """Print *parts* separated by single spaces, like the Python 2 print statement."""
    print(' '.join(str(part) for part in parts))


def _shifted_log(values):
    """Return ``log(values)`` shifted so that its minimum is zero.

    Non-positive entries become NaN instead of -inf: a single -inf would
    turn the shift into inf/NaN for every row. The NaN rows are removed by
    the ``dropna()`` calls that follow each use.
    """
    logged = np.log(values.where(values > 0))
    return logged - logged.min()


def _sort_by_popularity(frame):
    """Return *frame* ordered from most to least popular."""
    if hasattr(frame, 'sort_values'):  # pandas >= 0.17
        return frame.sort_values('popularity', ascending=False)
    return frame.sort('popularity', ascending=False)  # removed in pandas 0.20


def _boxplot_grid(frame, columns, by, n_rows, title):
    """Draw one boxplot per column of *columns*, grouped by *by*, on a 2-wide grid."""
    plt.figure(figsize=(20, 20))
    for position, column in enumerate(columns, 1):
        ax = plt.subplot(n_rows, 2, position)
        frame.boxplot(column, by=by, ax=ax)
        ax.set_title(str(column), fontsize=15)
        ax.set_xlabel('')
    plt.suptitle(title, size=20)


def quartile(pop):
    """Map a popularity value to a class 1-4 using fixed cut-offs.

    NOTE(review): the cut-offs are the constants 0.25 / 0.5 / 0.75, not
    quartiles of the observed distribution; this presumably expects
    popularity on a 0-1 scale -- confirm against the data.
    """
    if pop < .25:
        return 1
    if pop < .5:
        return 2
    if pop < .75:
        return 3
    return 4


def decade(date):
    """Return the decade (e.g. 1980) of *date*.

    The year is taken from the text before the first '-' in ``str(date)``,
    so a four-digit year must lead the string form.
    """
    year = int(str(date).split('-')[0])
    return year - year % 10


def genre_num(lst):
    """Return the position of genre name *lst* in the module-level ``genre`` list."""
    return genre.index(lst)


# ---------------------------------------------------------------------------
# 1. Top 40 entries per genre over time
# ---------------------------------------------------------------------------
songs = pd.read_csv(SONG_GENRE_CSV)
genres = pd.DataFrame(songs.genre.value_counts())
genre_list = ['rock', 'r&b', 'country', 'soul', 'hip hop', 'dance', 'rap', 'jazz', 'folk',
              'disco', 'funk', 'reggae', 'latin', 'house', 'electronic', 'blues', 'metal']
song_genre = songs[songs['genre'].isin(genre_list)].drop_duplicates()
song_genre['date'] = pd.to_datetime(song_genre['date'], format='%m/%d/%Y')

# One indicator column per genre, summed per chart date.
genre_dummy = pd.get_dummies(song_genre['genre'])
song_date = genre_dummy.join(song_genre['date'])
song_group = song_date.groupby('date').agg('sum')
song_group.plot(kind='area', figsize=(30, 18), colormap='Paired')
plt.legend(loc=2, fontsize=15, ncol=3, markerscale=100)
plt.suptitle('Billboard Top 40 by Genre (Jan 1960 - Sep 2014)', fontsize=35)
plt.yticks(fontsize=20)
plt.xticks(fontsize=25)
plt.xlabel('date', fontsize=25)

# ---------------------------------------------------------------------------
# 2. Song attributes and popularity of #1 songs
# ---------------------------------------------------------------------------
songs = pd.read_csv(SONG_DATA_CSV)
_say(songs.head())  # was a bare notebook-cell expression
scatter_matrix(songs[['popularity', 'danceability', 'duration', 'tempo', 'loudness', 'energy']],
               figsize=(20, 20))
plt.suptitle('Scatterplot Matrix of Song Attributes', size=25)

songs_1 = songs[songs['rank'] == 1].copy()  # copy: a column is assigned below
songs_1['date'] = pd.to_datetime(songs_1['date'])
songs_1.plot(x='date', y='popularity', figsize=(25, 8))
plt.suptitle('Time Series of Current Popularity of #1 Songs', size=25)
plt.ylabel('Current Popularity')

# ---------------------------------------------------------------------------
# 3. Linear regression of popularity
# ---------------------------------------------------------------------------
top_songs = pd.read_csv(TOP_RANK_CSV)
songs2 = songs.drop(['date', 'rank'], axis=1).drop_duplicates()
top_songs = pd.merge(songs2, top_songs, on='uri')
top_songs['date'] = pd.to_datetime(top_songs['date'])
# Whole days between each row's date and the latest date in the data.
top_songs['time'] = top_songs['date'].max() - top_songs['date']
top_songs['days'] = (top_songs['time'] / np.timedelta64(1, 'D')).astype(int)

clf = linear_model.LinearRegression()
features = ['danceability', 'duration', 'energy', 'instrumentalness', 'key', 'liveness',
            'loudness', 'tempo', 'time_signature', 'days']
X = top_songs[features].values
y = top_songs['popularity'].values
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
model = clf.fit(xtrain, ytrain)
_say('Train Score: ', model.score(xtrain, ytrain))
_say('Test Score: ', model.score(xtest, ytest))
_say(pd.DataFrame(list(zip(features, model.coef_.T)), columns=['Variable', 'Coefficient']))

f = feature_selection.f_regression(X, y)  # f[1] holds the per-feature p-values
_say()
_say(pd.DataFrame(list(zip(features, f[1].T)), columns=['Variable', 'P-Value']))

plt.figure()  # own figure; otherwise this lands on the time-series axes above
plt.scatter(x=top_songs.days, y=top_songs.loudness)
plt.ylabel('loudness (decibels)', size=10)
# NOTE(review): label text kept as-is; its parenthesis is unbalanced.
plt.xlabel('Elapsed Days (Max Top 40 Ranking to 9/2014', size=10)
plt.suptitle('Loudness vs Days Elapsed')
plt.xticks(size=8)
plt.yticks(size=8)

newer_feature = ['days']
X = top_songs[newer_feature].values
y = top_songs['popularity'].values
# Fix: re-split the new X. The original refit on the previous ten-feature
# split, so this model never actually used the days-only design matrix.
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
model = clf.fit(xtrain, ytrain)
_say('Train Score: ', model.score(xtrain, ytrain))
_say('Test Score: ', model.score(xtest, ytest))

# ---------------------------------------------------------------------------
# 4. Logistic regression on popularity classes
# ---------------------------------------------------------------------------
top_songs['log_speech'] = _shifted_log(top_songs['speechiness'])
top_songs['log_instrument'] = _shifted_log(top_songs['instrumentalness'])
top_songs['loudness'] = top_songs['loudness'] - top_songs['loudness'].min()
top_songs['med_pop'] = (top_songs['popularity'] >= top_songs['popularity'].median()).astype('int')
top_songs = top_songs.dropna()

features = list(AUDIO_FEATURES)
X = top_songs[features].values
y = top_songs['med_pop'].values
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
log = linear_model.LogisticRegression()
model = log.fit(xtrain, ytrain)
_say('Train Score: ', model.score(xtrain, ytrain))
_say('Test Score: ', model.score(xtest, ytest))
_boxplot_grid(top_songs, features, 'med_pop', 6, 'Feature Boxplots by Median Quantile')

top_songs['quart_pop'] = top_songs['popularity'].apply(quartile).astype('int')
features = list(AUDIO_FEATURES)
X = top_songs[features].values
y = top_songs['quart_pop'].values
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
model = log.fit(xtrain, ytrain)
_say('Train Score: ', model.score(xtrain, ytrain))
_say('Test Score: ', model.score(xtest, ytest))
_boxplot_grid(top_songs, features, 'quart_pop', 6, 'Feature Boxplots by Quartile')

# ---------------------------------------------------------------------------
# 5a. Random forest: decade of a rock song
# ---------------------------------------------------------------------------
song_genre['log_speech'] = _shifted_log(song_genre['speechiness'])
song_genre['log_instrument'] = _shifted_log(song_genre['instrumentalness'])

rock = song_genre[song_genre['genre'] == 'rock'].copy()
rock['decade'] = rock['date'].apply(decade)
rock = rock.drop(['rank', 'date'], axis=1).drop_duplicates().reset_index(drop=True)
_say(rock.head())  # was a bare notebook-cell expression

features = list(AUDIO_FEATURES)
rock = rock.dropna()
plt.figure(figsize=(20, 20))
for position, column in enumerate(features, 1):
    ax1 = plt.subplot(5, 2, position)
    ax1.hist(rock[column])
    ax1.set_title(str(column), fontsize=15)
_boxplot_grid(rock, features, 'decade', 5, 'Rock Feature Boxplots by Decade')

rock['loudness'] = rock['loudness'] - rock['loudness'].min()
X = rock[features]
Y = rock['decade']
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)
plt.figure()  # own figure; otherwise this lands on the last boxplot axes
plt.hist(ytrain)
plt.xlabel('Song Decade', size=12)
plt.suptitle('Distribution of Songs in the Training Set')

rf = RandomForestClassifier(n_estimators=800).fit(xtrain, ytrain)
_say('Train: ', rf.score(xtrain, ytrain))
_say('Test: ', rf.score(xtest, ytest))

y_pred = rf.predict(xtest)
# Pass the labels explicitly so the matrix keeps its shape even when a
# decade is missing from the test split; names are derived from the data
# (the original hard-coded '1960'..'2010').
decade_values = sorted(rock['decade'].unique())
decades = [str(value) for value in decade_values]
matrix = pd.DataFrame(confusion_matrix(ytest, y_pred, labels=decade_values),
                      columns=decades, index=decades)
_say('Confusion Matrix')
_say()
_say(matrix, '\n')
_say('Classification Report')
_say()
_say(classification_report(ytest, y_pred, labels=decade_values, target_names=decades), "\n")

# ---------------------------------------------------------------------------
# 5b. Random forest: genre of a song (three genres)
# ---------------------------------------------------------------------------
genre = ['r&b', 'hip hop', 'country']
genre_subset = song_genre[song_genre['genre'].isin(genre)]
genre_subset = genre_subset.drop(['date', 'rank'], axis=1).drop_duplicates()
_boxplot_grid(genre_subset, features, 'genre', 5, 'Feature Boxplots by Genre')

genre_subset['genre_num'] = genre_subset['genre'].apply(genre_num)
genre_subset = genre_subset.dropna()
X = genre_subset[features]
Y = genre_subset['genre_num']
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)
rf = RandomForestClassifier(n_estimators=800).fit(xtrain, ytrain)
_say('Train: ', rf.score(xtrain, ytrain))
_say('Test: ', rf.score(xtest, ytest))

y_pred = rf.predict(xtest)
genre_codes = list(range(len(genre)))  # the values genre_num() can produce
matrix = pd.DataFrame(confusion_matrix(ytest, y_pred, labels=genre_codes),
                      columns=genre, index=genre)
_say('Confusion Matrix')
_say()
_say(matrix, '\n')
_say('Classification Report')
_say()
_say(classification_report(ytest, y_pred, labels=genre_codes, target_names=genre), "\n")

# ---------------------------------------------------------------------------
# 6. K-means clustering of four genres
# ---------------------------------------------------------------------------
genre = ['r&b', 'hip hop', 'country', 'rock']
features = ['danceability', 'energy', 'liveness', 'time_signature', 'log_speech']
genre_subset = song_genre[song_genre['genre'].isin(genre)]
genre_subset = genre_subset.drop(['date', 'rank'], axis=1).drop_duplicates()
_boxplot_grid(genre_subset, features, 'genre', 5, 'Feature Boxplots by Genre')

for i in genre:
    cluster_genre = genre_subset[genre_subset['genre'] == i]
    _say('Top 10 Songs for Genre: %s' % i)
    _say()
    _say(_sort_by_popularity(cluster_genre[['title', 'artist', 'popularity', 'genre']]).head(10))
    _say()

features = ['danceability', 'energy', 'liveness', 'time_signature', 'log_speech']
genre_subset = genre_subset.dropna()
X = genre_subset[features]
km = KMeans(n_clusters=len(genre)).fit(X)
genre_subset['prediction'] = km.predict(genre_subset[features])
_boxplot_grid(genre_subset, features, 'prediction', 5, 'Feature Boxplots by Clustered Genre')

genre_dict = {}
for i in np.sort(genre_subset['prediction'].unique()):
    cluster_genre = genre_subset[genre_subset['prediction'] == i]
    cluster_genre = _sort_by_popularity(cluster_genre).reset_index()
    _say('Top 10 Songs for Cluster Genre %s' % i)
    _say()
    _say(cluster_genre[['title', 'artist', 'popularity', 'genre']].head(10))
    _say()
    # The 20 most popular tracks of the cluster, comma-joined for the embed.
    track_ids = ','.join(list(cluster_genre['uri'][0:20]))
    genre_dict[i] = SPOTIFY_EMBED % (i, track_ids)

# The original evaluated HTML(genre_dict[0]) .. HTML(genre_dict[3]) as bare
# expressions, which render nothing outside separate notebook cells.
for i in sorted(genre_dict):
    display(HTML(genre_dict[i]))

# Needed to see the figures when run as a plain script.
plt.show()