#!/usr/bin/env python
# coding: utf-8

# # Reshaping & Tidy Data
# 
# > Structuring datasets to facilitate analysis [(Wickham 2014)](http://www.jstatsoft.org/v59/i10/paper)
# 
# So, you've sat down to analyze a new dataset.
# What do you do first?
# 
# In episode 11 of [Not So Standard Deviations](https://www.patreon.com/NSSDeviations?ty=h), Hilary and Roger discussed their approaches.
# I'm with Hilary on this one: you should make sure your data is tidy
# before you do any plots, filtering, transformations, summary statistics, or regressions.
# Once you have a tidy dataset, it's relatively easy to do all of those.
# 
# Today we'll talk about what makes a dataset tidy and how to get there.
# Hadley Wickham (who I think is tied with Wes for number of shoutouts in this series...) kindly summarized tidiness as a dataset where
# 
# 1. Each variable forms a column
# 2. Each observation forms a row
# 3. Each type of observational unit forms a table
# 
# Most of the time it's just the first two that matter.
# As quoted at the top, this really is about facilitating analysis: going as quickly as possible from question to answer.
# 
# We'll cover a few methods that help you get there.

# In[1]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_rows = 10
sns.set(style='ticks', context='talk')
get_ipython().run_line_magic('matplotlib', 'inline')

# # NBA Data
# 
# *Note: I'm reusing this example from an [earlier post](tidy-data.html).*
# 
# Based on [this](http://stackoverflow.com/questions/22695680/python-pandas-timedelta-specific-rows) StackOverflow question.
# We'll grab some NBA game data from basketball-reference.com using pandas' `read_html` function, which returns a list of DataFrames.

# In[7]:

tables = pd.read_html("http://www.basketball-reference.com/leagues/NBA_2016_games.html")
games = tables[0]
games.head()

# Side note: pandas' `read_html` is pretty good. On simple websites it almost always works.
# It provides a couple of parameters for controlling what gets selected from the webpage if the defaults fail.
# I'll always use it first, before moving on to BeautifulSoup or lxml if the page is more complicated.

# As you can see, we have a bit of general munging to do before tidying.
# Each month slips in an extra row of mostly NaNs, the column names aren't too useful, and we have some dtypes to fix up.

# In[8]:

column_names = {'Date': 'date', 'Start (ET)': 'start', 'Unnamed: 2': 'box',
                'Visitor/Neutral': 'away_team', 'PTS': 'away_points',
                'Home/Neutral': 'home_team', 'PTS.1': 'home_points',
                'Unnamed: 7': 'n_ot'}

games = (games.rename(columns=column_names)
              .dropna(thresh=4)
              [['date', 'away_team', 'away_points', 'home_team', 'home_points']]
              .assign(date=lambda x: pd.to_datetime(x['date'], format='%a, %b %d, %Y'))
              .set_index('date', append=True)
              .rename_axis(["game_id", "date"])
              .sort_index())
games.head()

# A quick aside on that last block.
# 
# - `dropna` has a `thresh`old argument: a row is kept only if it has at least `thresh` non-missing values. We used it to remove the "Month" header rows that slipped into the table
# - `assign` can take a callable. This lets us refer to the DataFrame from the previous step of the chain. Otherwise we would have to assign `temp_df = games.dropna()...` and then do the `pd.to_datetime` on that
# - `set_index` has an `append` keyword. We keep the original index around since it will be our unique identifier per game
# - We use `.rename_axis` to set the index names (this behavior is new in pandas 0.18; before that, `.rename_axis` only took a mapping for changing labels)
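# To make those keywords concrete, here's a minimal sketch on a made-up three-row frame
# (the toy data is hypothetical, just for illustration; it assumes the imports from the first cell):

# In[ ]:

# Toy frame: the middle row is entirely missing, the last row is partially missing.
toy = pd.DataFrame({'a': [1, np.nan, 3],
                    'b': [4, np.nan, np.nan],
                    'c': ['x', np.nan, 'z']})
# thresh=2 keeps only rows with at least 2 non-missing values,
# so the all-NaN middle row is dropped and the other two survive.
kept = toy.dropna(thresh=2)
# The callable passed to assign receives the intermediate frame,
# letting us chain without naming a temporary variable.
kept.assign(a_doubled=lambda x: x['a'] * 2)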
# The Question:
# > **How many days of rest did each team get between each game?**
# 
# Whether or not your dataset is tidy depends on your question. Given our question, what is an observation?
# 
# In this case, an observation is a `(team, game)` pair. Unfortunately we don't have that yet.
# We have two observations per row, one for the home team and one for the away team.
# We'll fix that with `pd.melt`.
# 
# `melt` works by taking observations that are spread across columns (`away_team`, `home_team`) and melting them down into one column with multiple rows.
# However, we don't want to lose the metadata (like `game_id` and `date`) that is shared between the observations.
# By including those columns as `id_vars`, their values are repeated as many times as needed to stay with their observations.

# In[9]:

tidy = pd.melt(games.reset_index(),
               id_vars=['game_id', 'date'],
               value_vars=['away_team', 'home_team'],
               value_name='team')
tidy.head()

# The DataFrame `tidy` meets our rules for tidiness: each variable is in a column, and each observation (a `(team, date)` pair) is on its own row.
# Now the translation from question ("How many days of rest?") to operation ("date of today's game - date of previous game - 1") is direct:

# In[12]:

# For each team... get the number of days between games
tidy.groupby('team')['date'].diff().dt.days - 1

# That's the essence of tidy data, the reason why it's worth considering what shape your data should be in.
# It's about setting yourself up for success, so that the answers naturally flow from the data (just kidding, it's still difficult. But hopefully less so).
# 
# Let's assign that back into our DataFrame.

# In[13]:

tidy['rest'] = tidy.sort_values('date').groupby('team').date.diff().dt.days - 1
tidy.dropna().head()

# To show the inverse of `melt`, let's take the `rest` values we just calculated and place them back in the original DataFrame with a `pivot_table`.

# In[18]:

by_game = (pd.pivot_table(tidy, values='rest',
                          index=['game_id', 'date'],
                          columns='variable')
             .rename(columns={'away_team': 'away_rest',
                              'home_team': 'home_rest'}))
df = pd.concat([games, by_game], axis=1)
df.dropna().head()

# One somewhat subtle point: an "observation" depends on the question being asked.
# 
# So really, we have two tidy datasets: `tidy` for answering team-level things, and `df` for game-level things.
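# If the `melt` / `pivot_table` round trip is hard to picture on the full dataset,
# here's a minimal sketch on a hypothetical two-game frame (the data is made up for illustration):

# In[ ]:

# Wide: one row per game, one column per role.
mini = pd.DataFrame({'game_id': [0, 1],
                     'away_team': ['Bulls', 'Knicks'],
                     'home_team': ['Heat', 'Nets']})
# melt: one row per (game, team) observation; 'variable' records the role.
long = pd.melt(mini, id_vars=['game_id'],
               value_vars=['away_team', 'home_team'], value_name='team')
# pivot_table inverts it: back to one row per game, one column per role
# (aggfunc='first' because the values are strings, not numbers to average).
long.pivot_table(index='game_id', columns='variable',
                 values='team', aggfunc='first')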
# One potentially interesting question is "what was each team's average days of rest, at home and on the road?"
# With a tidy dataset (the DataFrame `tidy`, since it's team-level), seaborn makes this easy:

# In[24]:

g = sns.FacetGrid(tidy, col='team', col_wrap=6, hue='team')
g.map(sns.barplot, 'variable', 'rest');

# An example of a game-level statistic is the distribution of rest differences in games:

# In[27]:

df['home_win'] = df['home_points'] > df['away_points']
df['rest_spread'] = df['home_rest'] - df['away_rest']
df.dropna().head()

# In[28]:

delta = (by_game.home_rest - by_game.away_rest).dropna().astype(int)
ax = (delta.value_counts()
           .reindex(np.arange(delta.min(), delta.max() + 1), fill_value=0)
           .sort_index()
           .plot(kind='bar', color='k', width=.9, rot=0, figsize=(12, 6)))
sns.despine()
ax.set(xlabel='Difference in Rest (Home - Away)', ylabel='Games')
plt.savefig('../content/images/tidy_diff_in_rest.svg', transparent=True)

# Or the win percent by rest difference:

# In[29]:

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='rest_spread', y='home_win',
            data=df.query('-3 <= rest_spread <= 3'),
            color='#4c72b0', ax=ax)
sns.despine()

# # Stack / Unstack
# 
# Pandas has two useful methods for quickly converting from wide to long format (`stack`) and long to wide (`unstack`).

# In[67]:

rest = (tidy.groupby(['date', 'variable'])
            .rest.mean()
            .dropna())
rest.head()

# `rest` is in "long" form since we have a single column of data, with multiple "columns" of metadata (in the MultiIndex). We use `.unstack` to move from long to wide.

# In[68]:

rest.unstack().head()

# `unstack` moves a level of a MultiIndex (innermost by default) up to the columns.
# `stack` is the inverse.

# In[31]:

rest.unstack().stack()

# With `.unstack` you can move between APIs that expect their data in long format and APIs that work with wide-format data. For example, `DataFrame.plot()` works with wide-form data, one line per column.

# In[35]:

ax = (rest.unstack()
          .query('away_team < 7')
          .rolling(7)
          .mean()
          .plot(figsize=(12, 6), linewidth=3, legend=False))
ax.annotate("Home", (rest.index[-1][0], 1.02), color='g', size=14)
ax.annotate("Away", (rest.index[-1][0], 0.82), color='b', size=14)
sns.despine()

# The most convenient form will depend on exactly what you're doing.
# When interacting with databases, you'll often deal with long-form data.
# Pandas' `DataFrame.plot` often expects wide-form data, while `seaborn` often expects long-form data (more on that in a future post). Regressions will expect wide-form data. Either way, it's good to be comfortable with `stack` and `unstack` (and MultiIndexes) to quickly move between the two.

# # Mini Project: Home Court Advantage?
# 
# We've gone to all that work tidying our dataset, so let's put it to use.
# What's the effect (in terms of probability of winning) of being the home team?

# ### Step 1: Create an outcome variable
# 
# We need to create an indicator for whether the home team won.
# Add it as a column called `home_win` in `df`.

# In[37]:

df['home_win'] = df.home_points > df.away_points
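# Before modeling, a quick baseline is handy. As a minimal sketch (using the `df` built above),
# the unconditional home-win rate is just the mean of that indicator:

# In[ ]:

# Share of games won by the home team; with no home-court advantage
# we'd expect this to be roughly 0.5.
df['home_win'].mean()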
# ### Step 2: Find the win percent for each team
# 
# We'll use a similar `melt` operation to the one earlier, only now with the `home_win` variable we just created.

# In[38]:

wins = (pd.melt(df.reset_index(),
                id_vars=['game_id', 'date', 'home_win'],
                value_name='team', var_name='is_home',
                value_vars=['home_team', 'away_team'])
          .assign(win=lambda x: x.home_win == (x.is_home == 'home_team'))
          .groupby(['team', 'is_home'])
          .win
          .agg({'n_wins': 'sum', 'n_games': 'count', 'win_pct': 'mean'}))
wins.head()

# Pause for visualization, because why not.

# In[71]:

g = sns.FacetGrid(wins.reset_index(), hue='team', size=10, aspect=.5, palette=['k'])
g.map(sns.pointplot, 'is_home', 'win_pct').set(ylim=(0, 1));

# (It'd be great if there were a library built on top of matplotlib that auto-labeled each point decently well. Apparently this is a difficult problem to solve in general.)

# In[72]:

g = sns.FacetGrid(wins.reset_index(), col='team', hue='team', col_wrap=5)
g.map(sns.pointplot, 'is_home', 'win_pct');

# Those two graphs show that most teams have a higher win percent at home than away, so we can continue to investigate.
# Let's aggregate over home/away to get an overall win percent per team.

# In[73]:

win_percent = (
    # Use sum(wins) / sum(games) since I don't know
    # whether teams play the same number of games at
    # home as away
    wins.groupby(level='team', as_index=True)
        .apply(lambda x: x.n_wins.sum() / x.n_games.sum())
)
win_percent.head()

# In[74]:

win_percent.sort_values().plot.barh(figsize=(6, 12), width=.85, color='k')
plt.tight_layout()
sns.despine()
plt.xlabel("Win Percent");

# Is there a relationship between overall team strength and home-court advantage?

# In[76]:

(wins.win_pct
     .unstack()
     .assign(**{'Home Win % - Away %': lambda x: x.home_team - x.away_team,
                'Overall %': lambda x: (x.home_team + x.away_team) / 2})
     .pipe((sns.regplot, 'data'), x='Overall %', y='Home Win % - Away %'))
sns.despine()

# And get that back into `df`.
# You could use `pd.merge`, but I prefer `.map` when joining a `Series`.

# In[77]:

df = df.assign(away_strength=df['away_team'].map(win_percent),
               home_strength=df['home_team'].map(win_percent),
               point_diff=df['home_points'] - df['away_points'],
               rest_diff=df['home_rest'] - df['away_rest'])
df.head()

# In[82]:

import statsmodels.formula.api as sm

df['home_win'] = df.home_win.astype(int)  # for statsmodels

# In[83]:

mod = sm.logit('home_win ~ home_strength + away_strength + home_rest + away_rest', df)
res = mod.fit()
res.summary()

# The strength variables both have large coefficients (really we should be using some independent measure of team strength here; `win_percent` is showing up on both the left- and right-hand sides of the equation). The rest variables don't seem to matter as much.
# 
# It's easy to explore variations on the formula.

# In[87]:

(sm.Logit.from_formula('home_win ~ strength_diff + rest_spread',
                       df.assign(strength_diff=df.home_strength - df.away_strength))
   .fit().summary())

# In[88]:

mod = sm.Logit.from_formula('home_win ~ home_rest + away_rest', df)
res = mod.fit()
res.summary()
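# To translate those log-odds coefficients into something more interpretable, one option is
# statsmodels' average marginal effects. This is a sketch, not part of the original analysis;
# it assumes the fitted `res` from the last cell.

# In[ ]:

# Average marginal effect: the change in P(home_win) for one extra day
# of home or away rest, averaged over the observed games.
res.get_margeff().summary()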