#!/usr/bin/env python
# coding: utf-8

# Notebook export: complexity-bias analysis of board game ratings.
# This first section configures plotting, loads the games CSV, keeps only
# base games and marks zero-valued fields as missing.

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')

import seaborn as sns
import numpy as np  # imported explicitly: the `pd.np` alias is deprecated in pandas
import pandas as pd
import statsmodels.api as sm

# NOTE(review): `figsize` and `plt` are not imported here; presumably they are
# injected into the namespace by the %pylab magic above -- confirm.
figsize(10, 6)
plt.rcParams['figure.dpi'] = 300

# In[4]:

plt.style.use('ggplot')
#plt.rcParams['font.family'] = 'Myriad Pro'
plt.rcParams['text.color'] = '#555555'


# # Load data

# NOTE: the exported notebook contained this load / filter / clean sequence
# twice (once as a single cell, once split over several cells). Both copies
# produced the same `df`, so only one is kept.

# In[5]:

df = pd.read_csv('games_Jan2018.csv')
df.set_index('id', inplace=True)


# In[6]:

df.type.value_counts()


# In[7]:

# Let's filter out expansions, and focus our analysis on base games.
# `.copy()` makes the filtered frame independent of the original, so the
# column assignment below does not write into a slice.
df = df[df.type == 'boardgame'].copy()


# In[8]:

# A little bit of data cleaning to set zeros to NAs where they should be NAs
zero_as_missing_cols = ['yearpublished', 'minplayers', 'maxplayers', 'playingtime',
                        'minplaytime', 'maxplaytime', 'minage']
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)


# In[9]:

# Boolean masks selecting games with at least 150 ratings / weight votes
enough_ratings = df.users_rated >= 150
enough_weights = df.total_weights >= 150
enough_ratings.sum(), enough_weights.sum()


# # Complexity / weight
# BGG allows users to rate the "weight" or complexity of a game.
# This is a single measure for each game on a scale from 1-5, and it doesn't decouple the different types of complexity
# (e.g. Chess, which is relatively easy in terms of rules but can be quite complex in terms of how to use those rules
# to play, as opposed to Terra Mystica, where it takes a while to learn all the rules, but once you've got the hang
# of the rules, the gameplay is relatively straightforward)
#
# N.B. There's probably elegant terminology for these different types of complexity. Colm may know the answer.
#
#
# Let's find out how reliable this information is by running some rudimentary tests against some simple
# intuition-based proxies for complexity

# In[10]:

# numpy is imported explicitly here because the `pd.np` alias used previously
# is deprecated in pandas.
import numpy as np

# Testing weight rating against minimum age
sns.regplot(x='minage', y='average_weight', data=df[df.total_weights>100], x_jitter=.3, scatter_kws={'s':6})
# Min age is a loose proxy for complexity, but it can also be filtered due to content type such as very
# explicit material (e.g. Cards against Humanity)


# In[12]:

cax = sns.regplot(x='average_weight', y='average_rating', data=df[enough_weights],
                  scatter_kws={'alpha':0.8, 's':10, 'color':None, 'cmap':'seismic_r',},
                  line_kws={'lw':1, 'ls':':', 'color':'k'})

# (label, point being annotated, position of the label text)
annotated_games = [
    ('Monopoly', (1.685, 4.42057), (1.5, 3)),
    ("Sid Meier's Civilization: The Boardgame", (3.6454, 5.59183), (3, 4.5)),
    ("Risk", (2.1072, 5.57929), (2, 4)),
    ("Gloomhaven", (3.78, 9.00657), (3, 9.1)),
    ("Kingdom Death: Monster", (4.186, 8.97231), (3.2, 8.7)),
    ("Pandemic Legacy", (2.8026, 8.66878), (2.2, 9.0)),
    ("Codenames", (1.3535, 7.90691), (1.0, 8.5)),
    ("7 Wonders: Duel", (2.2463, 8.19443), (1.5, 9.1)),
    ("Patchwork", (1.7131, 7.83136), (1.2, 8.8)),
    ("Blackbeard", (3.3218, 6.09174), (3, 5)),
    ("Rise and Decline of the Third Reich", (4.2945, 6.79503), (3.5, 6)),
]
for label, point, label_pos in annotated_games:
    # A fresh arrowprops dict per call, exactly as in the original repeated calls
    plt.gca().annotate(label, xy=point, xytext=label_pos,
                       arrowprops=dict(facecolor='black', arrowstyle="->", edgecolor='k'))

plt.ylabel('Average rating')
plt.xlabel('Average weight score (complexity)')
plt.title('Rating vs complexity')
#ax1 = plt.gcf().add_axes([0.91, 0.125, 0.01, 0.75])
#norm = mpl.colors.Normalize(vmin=3, vmax=5)
#cb1 = mpl.colorbar.ColorbarBase(ax1, cmap='seismic_r',
#                                norm=norm,
#                                orientation='vertical')
#cb1.set_ticks([np.arange(3,6,1)])
#cb1.set_ticklabels(['1,000','10,000','100,000'])
#cb1.set_label('Number of owners')
plt.gcf().set_size_inches(10,6)


# In[13]:

# Linear fit of rating against weight, restricted to games with enough weight
# votes and an average weight above 1.5.
fit_mask = enough_weights & (df.average_weight > 1.5)
slope, intercept = np.polyfit(df[fit_mask].average_weight, df[fit_mask].average_rating, 1)
slope, intercept


# In[14]:

# The fitted coefficients are used directly. The notebook previously hard-coded
# 0.48932139 and 5.80326323 here, which look like the printed output of the fit
# above on the Jan 2018 data -- NOTE(review): confirm they match.
df['rating_residual'] = df.average_rating - (df.average_weight * slope + intercept)
df['corrected_rating'] = df['rating_residual'] + df[enough_ratings].average_rating.mean()

# Weighted average of the corrected rating with 1000 dummy ratings of 5.5
prior_rating = 5.5
prior_votes = 1000
df['bayes_corrected_rating'] = ((df.users_rated * df['corrected_rating'] + prior_rating * prior_votes)
                                / (df.users_rated + prior_votes))

# Ranks: 1 is the highest rating; zero ratings are treated as missing and get no rank
df['BGG_rank'] = df.bayes_average_rating.replace(0.000, np.nan).rank(method='min', ascending=False)
# Only games with at least 30 ratings receive a corrected rank
df['corrected_BGG_rank'] = (df[df.users_rated >= 30].bayes_corrected_rating
                            .replace(0.00, np.nan)
                            .rank(method='min', ascending=False))
df['rating_change'] = df['bayes_corrected_rating'] - df['bayes_average_rating']
# Positive rank_change means the game moved up after the correction
df['rank_change'] = df['BGG_rank'] - df['corrected_BGG_rank']


# In[63]:

df.to_csv('complexity_corrected_data.csv')


# In[15]:

# Let's get the top games after correcting for the complexity bias
df.sort_values('bayes_corrected_rating', ascending=False).head(100)#.to_clipboard()


# In[16]:

# Biggest winners
df[(df.BGG_rank <= 100) | (df.corrected_BGG_rank <= 100)].sort_values('rank_change', ascending=False)


# In[17]:

# Biggest losers
df[(df.BGG_rank <= 100) | (df.corrected_BGG_rank <= 100)].sort_values('rank_change', ascending=True).head(50)#.to_clipboard()


# In[ ]:


# In[ ]:


# ### Rendering animated graph

# In[18]:

get_ipython().run_line_magic('matplotlib', 'inline')


# In[19]:

plt.rcParams['figure.dpi'] = 300


# In[20]:

import matplotlib.animation as manimation

FFMpegWriter = manimation.writers['ffmpeg']
metadata = dict(title='Movie Test', artist='Matplotlib',
                comment='Movie support!')
fps = 25
writer = FFMpegWriter(fps=fps, metadata=metadata)

fig = plt.figure(figsize=(10,6), dpi=100)


def make_frame(t):
    """Draw one animation frame on the current axes.

    The y values are a blend of the original and the corrected rating:
    the original rating for t < 1, a linear transition for 1 <= t < 3,
    and the corrected rating from t = 3 onwards.

    Reads the module-level `df` and `enough_weights`; returns None.
    """
    plt.cla()
    # Blend factor: 0 before t=1, ramps linearly to 1 at t=3, then stays at 1
    w = min(max((t - 1) / 2.0, 0.0), 1.0)
    # Filter once per frame instead of once per referenced column
    games = df[enough_weights]
    sns.regplot(x=games.average_weight,
                y=(1 - w) * games.average_rating + w * games.corrected_rating,
                ci=None, scatter_kws={'alpha':0.8, 's':10})
    plt.ylabel('Average rating')
    plt.xlabel('Average weight score (complexity)')
    return None


with writer.saving(fig, "manual_animtion.mp4", 150):
    # Frame times from 0 to 4 inclusive, in steps of 1/fps
    for t in np.arange(0, 4 + 1. / fps, 1. / fps):
        make_frame(t)
        # Fixed y range so the axis does not rescale between frames
        plt.ylim(2, 9.5)
        writer.grab_frame()


# In[21]:

# Convert the mp4 to gif using ffmpeg
# the code below does that, including a gif palette optimization

# #!/bin/sh
#
# palette="/tmp/palette.png"
#
# filters="fps=25,scale=750:-1:flags=lanczos"
#
# ffmpeg -v warning -i manual_animtion.mp4 -vf "$filters,palettegen" -y $palette
# ffmpeg -v warning -i manual_animtion.mp4 -i $palette -lavfi "$filters [x]; [x][1:v] paletteuse" -y manual_animtion.gif


# In[ ]:


# ### Rendering interactive complexity vs rating plot

# In[22]:
# Interactive plot using Altair
import altair as alt
#from vega_datasets import data
alt.renderers.enable('notebook')
#iris = data.iris()

relevant_columns = ['average_weight', 'average_rating', 'corrected_rating', 'name']


def _complexity_scatter(y_field, y_title, chart_title):
    """Return a scatter chart of `y_field` against `average_weight`.

    Uses the games selected by the module-level `enough_weights` mask and
    shows the game name as tooltip. `y_title` labels the y axis and
    `chart_title` is the chart title.
    """
    return alt.Chart(df[enough_weights][relevant_columns]).mark_point(filled=True).encode(
        alt.X('average_weight', axis=alt.Axis(title='Complexity score'), scale=alt.Scale(domain=(.8, 5))),
        alt.Y(y_field, axis=alt.Axis(title=y_title)),
        color=alt.value("#E24A33"),
        tooltip='name'
    ).properties(width=500, title=chart_title)


before_correction = _complexity_scatter('average_rating', 'Game rating',
                                        'Before complexity-bias correction')
after_correction = _complexity_scatter('corrected_rating', 'Complexity-bias-corrected game rating',
                                       'After complexity-bias correction')

# alt.hconcat places the charts side by side. Passing a list positionally to
# alt.HConcatChart would bind it to the `data` parameter, not to `hconcat`.
alt.hconcat(before_correction, after_correction)#.save('interactive_chart.html')


# In[29]:

# Altair's API is pretty slick, but it doesn't seem flexible enough to accommodate what I'm trying to do
# There's no intuitive way to get multiline hover tooltips


# In[30]:

# Let's generate it using Bokeh instead...
# In[27]:

# Interactive scatter of corrected rating vs. complexity, rendered with Bokeh
# so that the hover tooltip can show several fields per game.
from bokeh.plotting import figure, output_file, output_notebook, show, ColumnDataSource
from bokeh.models import HoverTool

#output_file("toolbar.html")
output_notebook()

# Columns made available to the glyphs and to the hover tooltip
plot_columns = ['name', 'average_weight', 'average_rating', 'corrected_rating',
                'BGG_rank', 'corrected_BGG_rank']
source = ColumnDataSource(data=df[enough_weights][plot_columns])

# (tooltip label, "@column" reference into the data source)
tooltip_rows = [
    ("Name", "@name"),
    ("Average weight (complexity)", "@average_weight"),
    ("Original rating", "@average_rating"),
    ("Corrected rating", "@corrected_rating"),
    ("Original rank", "@BGG_rank"),
    ("Corrected rank", "@corrected_BGG_rank"),
]
hover = HoverTool(tooltips=tooltip_rows)

p = figure(plot_width=800, plot_height=450, tools=[hover],
           title="Game ratings after complexity-bias correction")
p.circle('average_weight', 'corrected_rating', size=4, source=source, color='#E24A33')

label_font = 'Open Sans'
p.title.align = 'center'
p.title.text_font = label_font

# Same label styling for both axes
for axis, axis_title in ((p.xaxis, 'Average weight score (Complexity)'),
                         (p.yaxis, 'Average rating')):
    axis.axis_label = axis_title
    axis.axis_label_text_font_style = "normal"
    axis.axis_label_text_font = label_font

show(p)


# In[ ]: