#!/usr/bin/env python
# coding: utf-8

# ## Correlation Plot
# 
# The `corr_plot` builder takes a dataframe (either a pandas `DataFrame` or a plain Python `dict`) as input and
# builds a correlation plot.
# 
# It allows combining 'tile', 'point' and 'label' layers in a matrix of 'full', 'lower' or 'upper' type.
# 
# A call to the terminal `build()` method will create a resulting 'plot' object. 
# This 'plot' object can be further refined using regular Lets-Plot (ggplot) API, like `+ ggtitle()`, `+ ggsize()` and so on.
# 
# 
# The Ames Housing dataset for this demo was downloaded from [House Prices - Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=train.csv) (train.csv), (c) Kaggle.

# In[1]:


import numpy as np
import pandas as pd

from lets_plot import *
from lets_plot.bistro.corr import *

# Configure Lets-Plot HTML output (library setup call for this session).
LetsPlot.setup_html()


# In[2]:


# Load the 'mpg' dataset, drop the 'Unnamed: 0' column and keep only
# the numeric columns.
mpg_df = (
    pd.read_csv('https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/mpg.csv')
    .drop(columns=['Unnamed: 0'])
    .select_dtypes(include=np.number)
)
print(mpg_df.shape)
mpg_df.head()


# In[3]:


def group(plots, width=400, height=300, ncol=2):
    """
    Arrange plots in a grid of equally sized cells (a helper for this demo).

    Plots are placed left-to-right, top-to-bottom: the plot at position
    ``idx`` goes to column ``idx % ncol`` and row ``idx // ncol``.

    :param plots: iterable of plot objects to add to the bunch.
    :param width: width of each cell; also the horizontal step between cells.
    :param height: height of each cell; also the vertical step between rows.
    :param ncol: number of plots per row (defaults to 2).
    :return: the ``GGBunch`` containing all the given plots.
    :raises ValueError: if ``ncol`` is less than 1.
    """
    if ncol < 1:
        raise ValueError("ncol must be a positive integer")

    bunch = GGBunch()
    for idx, p in enumerate(plots):
        # Pure integer arithmetic: avoids the float round-trip of int(idx / 2).
        row, col = divmod(idx, ncol)
        bunch.add_plot(p, col * width, row * height, width, height)

    return bunch


# ### Combining 'tile', 'point' and 'label' layers.
# 
# When combining layers, `corr_plot` chooses an acceptable plot configuration by default.

# In[4]:


# Each entry pairs a plot title with a builder configured with the layers
# to show; every builder is built and titled in turn.
group([
    layers.build() + ggtitle(title)
    for title, layers in [
        ("Tiles", corr_plot(mpg_df).tiles()),
        ("Points", corr_plot(mpg_df).points()),
        ("Tiles and labels", corr_plot(mpg_df).tiles().labels()),
        ("Tiles, points and labels", corr_plot(mpg_df).points().labels().tiles()),
    ]
])


# The default plot configuration adapts to the changing options - compare 'Tiles and labels' plot above and below.
# 
# You can also override the default plot configuration using the parameter 'type' - compare 'Tiles, points and labels' plot above and below.

# In[5]:


group([
    # First plot: tiles with labels drawn in white.
    corr_plot(mpg_df).tiles().labels(color="white").build()
    + ggtitle("Tiles and labels"),
    # Second plot: an explicit matrix type for every layer.
    corr_plot(mpg_df).tiles(type="upper").points(type="lower").labels(type="full").build()
    + ggtitle("Tiles, points and labels"),
])


# ### Customizing colors.
# 
# Instead of the default blue-grey-red gradient you can define your own lower-middle-upper colors, or 
# choose one of the available 'Brewer' diverging palettes.
# 
# Let's create a gradient resembling one of Seaborn gradients.

# In[6]:


# One builder with all three layers, used for both plots below.
# NOTE(review): `bld` is shared; presumably each `palette_*` call reconfigures
# the same builder, so the first plot is built before the palette is switched.
bld = corr_plot(mpg_df).points().labels().tiles()

# Gradient resembling one of the Seaborn gradients.
gradient = (
    bld.palette_gradient(low='#417555', mid='#EDEDED', high='#963CA7').build()
    + ggtitle("Custom gradient")
)

# Brewer 'BrBG' palette.
brewer = bld.palette_BrBG().build() + ggtitle("Brewer")


# In[7]:


# Show the custom-gradient plot and the Brewer plot together.
group([gradient, brewer])


# ### 
# ### Correlation plot with a large number of variables in the dataset.
# 
# The [Kaggle House Prices](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=train.csv) dataset contains 81 variables.

# In[8]:


# Load the Ames housing training data and keep only the numeric columns.
housing_df = (
    pd.read_csv("https://raw.githubusercontent.com/JetBrains/lets-plot-docs/master/data/Ames_house_prices_train.csv")
    .select_dtypes(include=np.number)
)
print(housing_df.shape)
housing_df.head()


# 
# A correlation plot that shows all the correlations in this dataset is too large and barely useful.

# In[9]:


# Tiles in a 'lower' type matrix over the whole housing dataset,
# coloured with the Brewer 'BrBG' palette.
(corr_plot(housing_df)
 .tiles(type='lower')
 .palette_BrBG()
 .build())


# 
# #### The 'threshold' parameter.
# 
# The 'threshold' parameter lets us specify a minimum absolute correlation value; correlations below it are not shown.

# In[10]:


# Correlation plot of the housing data with `threshold=.5` and tiles
# configured with `diag=False`.
(
    corr_plot(housing_df, threshold=.5)
    .tiles(diag=False)
    .palette_BrBG()
    .build()
    + ggtitle("Threshold: 0.5")
    + ggsize(550, 400)
)


# 
# Let's further increase our threshold in order to see only highly correlated variables.
# 
# 

# In[11]:


# Correlation plot of the housing data with `threshold=.8` and tiles
# configured with `diag=False`.
(
    corr_plot(housing_df, threshold=.8).tiles(diag=False).palette_BrBG().build()
    + ggtitle("Threshold: 0.8")
    + ggsize(550, 400)
)