#!/usr/bin/env python # coding: utf-8 # ## Pandas Crosstab Explained # # Notebook that accompanies [article](http://pbpython.com/pandas-crosstab.html) on Practical Business Python # In[1]: import pandas as pd import seaborn as sns # In[2]: get_ipython().run_line_magic('matplotlib', 'inline') # ### Import the data # In[3]: # Define the headers since the data does not have any headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration", "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width", "height", "curb_weight", "engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"] # In[4]: # Read in the CSV file and convert "?" to NaN df_raw = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data", header=None, names=headers, na_values="?" ) df_raw.head() # In[5]: # Take a quick look at all the values in the data df_raw.describe() # In[6]: # Filter out the top 8 manufacturers models = ["toyota","nissan","mazda", "honda", "mitsubishi", "subaru", "volkswagen", "volvo"] # In[7]: df = df_raw[df_raw.make.isin(models)].copy() # In[8]: df.head() # In[9]: ### Basic Crosstab functions # In[10]: # Create a simple crosstab that counts the number of occurences of each combination pd.crosstab(df.make, df.num_doors) # In[11]: # Add a subtotal pd.crosstab(df.make, df.num_doors, margins=True, margins_name="Total") # In[12]: # Another example, this time of make and body_style pd.crosstab(df.make, df.body_style) # In[13]: # Add custom names for the rows and columns? pd.crosstab(df.make, df.body_style, rownames=['Auto Manufacturer'], colnames=['Body Style']) # ### Normalizing Results # In[14]: # Convert the occurrences to percentages pd.crosstab(df.make, df.body_style, normalize=True) # In[15]: # Convert the occurrences to percentages for each row pd.crosstab(df.make, df.body_style, normalize='index') # In[16]: # Convert the occurrences to percentages for each column pd.crosstab(df.make, df.body_style, normalize='columns') # In[17]: # If you want to make the percentages a little easier to see, multiple all values by 100 pd.crosstab(df.make, df.body_style, normalize='columns').mul(100).round(0) # ### Custom Aggregations and Grouping # In[18]: # Perform aggregation functions - not just a simple count pd.crosstab(df.make, df.body_style, values=df.curb_weight, aggfunc='mean').round(0) # In[19]: pd.crosstab(df.make, [df.body_style, df.drive_wheels], values=df.curb_weight, aggfunc='mean').fillna('-') # In[20]: # Crosstab supports grouping as well. In this case, group the columns pd.crosstab(df.make, [df.body_style, df.drive_wheels]) # In[21]: # A more complex example showing the grouping of rows and columns pd.crosstab([df.make, df.num_doors], [df.body_style, df.drive_wheels], rownames=['Auto Manufacturer', "Doors"], colnames=['Body Style', "Drive Type"], dropna=False) # In[22]: # You can also use agg functions when grouping pd.crosstab(df.make, [df.body_style, df.drive_wheels], values=df.curb_weight, aggfunc='mean').fillna('-') # In[23]: # You can also use margins when grouping pd.crosstab(df.make, [df.body_style, df.drive_wheels], values=df.curb_weight, aggfunc='mean', margins=True, margins_name='Average').fillna('-').round(0) # ### Visualizing results with Seaborn # In[24]: # Seaborn's heatmap can visualize the final results of the crosstab sns.heatmap(pd.crosstab(df.drive_wheels, df.make)) # In[25]: # This is a more complex customization of a heatmap sns.heatmap(pd.crosstab([df.make, df.num_doors], [df.body_style, df.drive_wheels]), cmap="YlGnBu", annot=True, cbar=False) # In[26]: sns.heatmap(pd.crosstab(df.make, df.body_style, values=df.curb_weight, aggfunc='mean').round(0)) # In[27]: sns.heatmap(pd.crosstab(df.make, [df.body_style, df.drive_wheels], values=df.curb_weight, aggfunc='mean', margins=True, margins_name='Average'), cmap="YlGnBu", annot=True, cbar=False, fmt='.0f')