#!/usr/bin/env python
# coding: utf-8

# ## Pandas Crosstab Explained
# 
# Notebook that accompanies [article](http://pbpython.com/pandas-crosstab.html) on Practical Business Python

# In[1]:


import pandas as pd
import seaborn as sns


# In[2]:


get_ipython().run_line_magic('matplotlib', 'inline')


# ### Import the data

# In[3]:


# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]


# In[4]:


# Read in the CSV file and convert "?" to NaN
df_raw = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
df_raw.head()


# In[5]:


# Take a quick look at all the values in the data
df_raw.describe()


# In[6]:


# Filter out the top 8 manufacturers
models = ["toyota","nissan","mazda", "honda", "mitsubishi", "subaru", "volkswagen", "volvo"]


# In[7]:


df = df_raw[df_raw.make.isin(models)].copy()


# In[8]:


df.head()


# In[9]:


### Basic Crosstab functions


# In[10]:


# Create a simple crosstab that counts the number of occurences of each combination
pd.crosstab(df.make, df.num_doors)


# In[11]:


# Add a subtotal
pd.crosstab(df.make, df.num_doors, margins=True, margins_name="Total")


# In[12]:


# Another example, this time of make and body_style
pd.crosstab(df.make, df.body_style)


# In[13]:


# Add custom names for the rows and columns?
pd.crosstab(df.make, df.body_style, rownames=['Auto Manufacturer'], colnames=['Body Style'])


# ### Normalizing Results

# In[14]:


# Convert the occurrences to percentages
pd.crosstab(df.make, df.body_style, normalize=True)


# In[15]:


# Convert the occurrences to percentages for each row
pd.crosstab(df.make, df.body_style, normalize='index')


# In[16]:


# Convert the occurrences to percentages for each column
pd.crosstab(df.make, df.body_style, normalize='columns')


# In[17]:


# If you want to make the percentages a little easier to see, multiple all values by 100
pd.crosstab(df.make, df.body_style, normalize='columns').mul(100).round(0)


# ### Custom Aggregations and Grouping

# In[18]:


# Perform aggregation functions - not just a simple count
pd.crosstab(df.make, df.body_style, values=df.curb_weight, aggfunc='mean').round(0)


# In[19]:


pd.crosstab(df.make, [df.body_style, df.drive_wheels], values=df.curb_weight, aggfunc='mean').fillna('-')


# In[20]:


# Crosstab supports grouping as well. In this case, group the columns
pd.crosstab(df.make, [df.body_style, df.drive_wheels])


# In[21]:


# A more complex example showing the grouping of rows and columns
pd.crosstab([df.make, df.num_doors], [df.body_style, df.drive_wheels], 
            rownames=['Auto Manufacturer', "Doors"], 
            colnames=['Body Style', "Drive Type"], 
            dropna=False)


# In[22]:


# You can also use agg functions when grouping
pd.crosstab(df.make, [df.body_style, df.drive_wheels], values=df.curb_weight, aggfunc='mean').fillna('-')


# In[23]:


# You can also use margins when grouping
pd.crosstab(df.make, [df.body_style, df.drive_wheels], 
            values=df.curb_weight, aggfunc='mean', margins=True,
            margins_name='Average').fillna('-').round(0)


# ### Visualizing results with Seaborn

# In[24]:


# Seaborn's heatmap can visualize the final results of the crosstab
sns.heatmap(pd.crosstab(df.drive_wheels, df.make))


# In[25]:


# This is a more complex customization of a heatmap
sns.heatmap(pd.crosstab([df.make, df.num_doors], [df.body_style, df.drive_wheels]), cmap="YlGnBu", 
            annot=True, cbar=False)


# In[26]:


sns.heatmap(pd.crosstab(df.make, df.body_style, values=df.curb_weight, aggfunc='mean').round(0))


# In[27]:


sns.heatmap(pd.crosstab(df.make, [df.body_style, df.drive_wheels], 
            values=df.curb_weight, aggfunc='mean', margins=True, margins_name='Average'),
           cmap="YlGnBu", annot=True, cbar=False, fmt='.0f')