#!/usr/bin/env python # coding: utf-8 # # Exploring the review scores data # # There's a lot going on in this dataset. This notebook follows my intuitions in an attempt to get a sense of the data. # In[1]: import sqlite3 import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy.signal import savgol_filter pd.set_option('precision', 2) np.set_printoptions(precision=2) con = sqlite3.connect('../pitchfork.db') reviews = pd.read_sql('SELECT * FROM reviews', con) genres = pd.read_sql('SELECT * FROM genres', con) con.close() # ## Score and Best New Music distributions # In[2]: print('\nAverages:') print(np.mean(reviews[['best_new_music', 'score']])) print('\nStandard Deviation:') print(np.std(reviews[['best_new_music', 'score']])) g = reviews.groupby('score') info = g['best_new_music'].agg(['sum','count']).reset_index() plt.plot(info['score'], savgol_filter(info['count'], 5, 1), label = 'All Reviews') plt.plot(info['score'], savgol_filter(info['sum'], 5, 1), label = "Best New Music") plt.legend(loc = 'best') plt.xlabel('Score') plt.ylabel('Frequency') plt.show() # Scores are roughly normally distributed, a little negative skew. The average release gets a score of about 7.0, and about 5% of releases are best new music, though this figure is artificially low because I included released from before best new music was around. The distribution declines pretty sharply at about 8.0, no wonder Pitchfork has [a special page for 8.0+ reviews](http://pitchfork.com/best/high-scoring-albums/)... # # There is also effectively no best new music before 8.0. And most releases above 8.5 are categorized as best new music. Makes you kind of feel bad for all the 8.3s and 8.4s that narrowly missed the cutoff... # # *Sanity check*: what are the 10.0 albums that are *not* best new music? # In[3]: idx = (reviews.best_new_music == 0) & (reviews.score == 10.0) reviews.loc[idx, ['artist', 'title', 'pub_date'] ] # All of these are re-issued classics that somehow missed the "best new reissue" label, or highly-rated albums reviewed prior to the best new music label. # ## Ratings by Genre # In[4]: genre_data = pd.merge(reviews[['reviewid','score']], genres, on = 'reviewid') g = genre_data.groupby('genre') table = g['score'].agg(['count', 'mean', 'std']).reset_index() # plot the average at each level of count avgline = table.groupby('count')['mean'].mean().reset_index() avgline['mean'] = savgol_filter(avgline['mean'], 5, 1) plt.plot(avgline['count'], avgline['mean'],'k--') plt.plot(table['count'],table['mean'],'o', alpha = 1) for j, row in table.iterrows(): curr_avg = float(avgline.loc[avgline['count'] == row['count'], 'mean']) jitter = np.random.uniform(0.1, high = 0.5) if row['mean'] < curr_avg: jitter*= -1.0 plt.plot([row['count'], row['count']], [row['mean'], row['mean'] + jitter], 'k-', alpha = 0.1) plt.text(row['count'], row['mean'] + jitter, row['genre'], ha = 'center', va = 'center') plt.ylabel('Average Score') plt.xlabel('Number of Reviews') plt.ylim([5, 10]) plt.show() # ## Ratings by Author # In[5]: g = reviews.groupby('author') table = g.score.agg(('mean','std','count')) table['ratio'] = table['mean'] / table['count'] # remove labels with only a handful of reviews table = table.loc[table['count'] > 15] # plot the average at each level of count avgline = table.groupby('count')['mean'].mean().reset_index() avgline['mean'] = savgol_filter(avgline['mean'], 5, 1) plt.plot(avgline['count'], avgline['mean'],'k--') # plot each author as a point plt.plot(table['count'], table['mean'],'o', alpha = 0.5) # identify some standouts items = [ table['mean'].idxmax(), table['mean'].idxmin(), table['count'].idxmax() ] for idx in items: x, y = table.loc[idx, 'count'], table.loc[idx, 'mean'] curr_avg = float(avgline.loc[avgline['count'] == x, 'mean']) jitter = np.random.uniform(0.1, high = 0.5) if y < curr_avg: jitter*= -1.0 plt.plot([x, x], [y, y + jitter], 'k-', alpha = 0.1) plt.text(x, y + jitter, idx, ha = 'center', va = 'center') plt.ylabel('Average Score') plt.xlabel('Number of Reviews') plt.ylim([5, 10]) plt.show()