#!/usr/bin/env python
# coding: utf-8

# ## Correlation Plot
#
# The `corr_plot` builder takes a dataframe (can be Pandas `DataFrame` or just Python `dict`) as the input and
# builds a correlation plot.
#
# It allows combining 'tile', 'point' or 'label' layers in a matrix of 'full', 'lower' or 'upper' type.
#
# A call to the terminal `build()` method will create a resulting 'plot' object.
# This 'plot' object can be further refined using regular Lets-Plot (ggplot) API, like `+ ggtitle()`, `+ ggsize()` and so on.
#
#
# The Ames Housing dataset for this demo was downloaded from [House Prices - Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=train.csv) (train.csv), (c) Kaggle.

# In[1]:


import numpy as np
import pandas as pd

from lets_plot import *
from lets_plot.bistro.corr import *

LetsPlot.setup_html()


# In[2]:


# Keep only the numeric columns: they are the ones the correlation plots below are built from.
mpg_df = (
    pd.read_csv('https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv')
    .drop(columns=['Unnamed: 0'])
    .select_dtypes(include=np.number)
)
print(mpg_df.shape)
mpg_df.head()


# In[3]:


def group(plots, width=400, height=300, cols=2):
    """
    Lay out plots on a regular grid inside a single `GGBunch` (useful for this demo).

    Plots fill the grid row by row, left to right; every plot gets a cell of
    the same size.

    Parameters
    ----------
    plots : iterable
        Plot objects to place in the bunch, in display order.
    width : int, default=400
        Width of one grid cell, passed to `GGBunch.add_plot`.
    height : int, default=300
        Height of one grid cell, passed to `GGBunch.add_plot`.
    cols : int, default=2
        Number of plots per row.

    Returns
    -------
    GGBunch
        The bunch containing all the given plots.

    Raises
    ------
    ValueError
        If `cols` is less than 1.
    """
    if cols < 1:
        raise ValueError("cols must be a positive integer")

    bunch = GGBunch()
    for idx, p in enumerate(plots):
        # Integer arithmetic only: the row is the quotient, the column is the remainder.
        row, col = divmod(idx, cols)
        bunch.add_plot(p, col * width, row * height, width, height)
    return bunch


# ### Combining 'tile', 'point' and 'label' layers.
#
# When combining layers, `corr_plot` chooses an acceptable plot configuration by default.

# In[4]:


group([
    corr_plot(mpg_df).tiles().build() + ggtitle("Tiles"),
    corr_plot(mpg_df).points().build() + ggtitle("Points"),
    corr_plot(mpg_df).tiles().labels().build() + ggtitle("Tiles and labels"),
    corr_plot(mpg_df).points().labels().tiles().build() + ggtitle("Tiles, points and labels")
])


# The default plot configuration adapts to the changing options - compare 'Tiles and labels' plot above and below.
#
# You can also override the default plot configuration using the parameter 'type' - compare 'Tiles, points and labels' plot above and below.

# In[5]:


group([
    corr_plot(mpg_df).tiles().labels(color="white").build() + ggtitle("Tiles and labels"),

    (corr_plot(mpg_df)
     .tiles(type="upper")
     .points(type="lower")
     .labels(type="full").build() + ggtitle("Tiles, points and labels"))
])


# ### Customizing colors.
#
# Instead of the default blue-grey-red gradient you can define your own lower-middle-upper colors, or
# choose one of the available 'Brewer' diverging palettes.
#
# Let's create a gradient resembling one of Seaborn gradients.

# In[6]:


# NOTE(review): the same builder object is reused for both plots below; each
# plot is built right after its palette is set, so the order of these
# statements presumably matters - confirm before reordering.
bld = corr_plot(mpg_df).points().labels().tiles()

# Configure gradient resembling one of Seaborn gradients.
gradient = (bld
            .palette_gradient(low='#417555', mid='#EDEDED', high='#963CA7')
            .build()) + ggtitle("Custom gradient")

# Configure Brewer 'BrBG' palette.
brewer = (bld
          .palette_BrBG()
          .build()) + ggtitle("Brewer")


# In[7]:


group([
    gradient, brewer
])


# ### Correlation plot with large number of variables in dataset.
#
# The [Kaggle House Prices](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=train.csv) dataset contains 81 variables.

# In[8]:


housing_df = (
    pd.read_csv("https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/Ames_house_prices_train.csv")
    .select_dtypes(include=np.number)
)
print(housing_df.shape)
housing_df.head()


# Correlation plot that shows all the correlations in this dataset is too large and barely useful.

# In[9]:


corr_plot(housing_df).tiles(type='lower').palette_BrBG().build()


# #### The 'threshold' parameter.
#
# The 'threshold' parameter lets us specify a level of significance, below which variables are not shown.

# In[10]:


(corr_plot(housing_df, threshold=.5).tiles(diag=False).palette_BrBG().build()
 + ggtitle("Threshold: 0.5")
 + ggsize(550, 400))


# Let's further increase our threshold in order to see only highly correlated variables.

# In[11]:


(corr_plot(housing_df, threshold=.8)
 .tiles(diag=False)
 .palette_BrBG().build()
 + ggtitle("Threshold: 0.8")
 + ggsize(550, 400))