#!/usr/bin/env python
# coding: utf-8

# ## 2D kernel density estimation and its Plotly  plot    ##

# We have two `Excel` files with two columns. We read the files into two `pandas` dataframes and plot
# for each of them an estimate of the joint distribution of the corresponding two columns. The joint distribution is calculated by the `scipy.stats.gaussian_kde` [function](http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.gaussian_kde.html).

# In[23]:


import numpy as np
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.stats as st

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# Read the first file:

# In[24]:


# Open the first Excel workbook and parse its "Sheet1" worksheet into `dfc`.
xl = pd.ExcelFile("Data/CSCEng.xls")
dfc = xl.parse("Sheet1")
# Bare expression: only shows the column labels when run as a notebook cell.
dfc.columns


# and the second one:

# In[25]:


# Open the second Excel workbook (rebinding `xl`) and parse its "Sheet1"
# worksheet into `dfi`.
xl = pd.ExcelFile("Data/SystEng.xls")
dfi = xl.parse("Sheet1")
# Bare expression: only shows the column labels when run as a notebook cell.
dfi.columns


# The contour plot of the joint distribution of two variables (columns) is colored with a custom  colorscale: 

# In[26]:


# Custom seven-stop colorscale: each entry is a [position, hex color] pair,
# with positions running from 0.0 to 1.0 and colors going from light to dark.
cubehelix_cs = list(map(list, (
    (0.0, '#fcf9f7'),
    (0.16666666666666666, '#edcfc9'),
    (0.3333333333333333, '#daa2ac'),
    (0.5, '#bc7897'),
    (0.6666666666666666, '#925684'),
    (0.8333333333333333, '#5f3868'),
    (1.0, '#2d1e3e'),
)))


# The function `kde_scipy` returns the data for a Plotly contour plot of the estimated 2D distribution:

# In[27]:


def kde_scipy(vals1, vals2, x_range, y_range, N):
    """Evaluate a 2D Gaussian kernel density estimate on an N x N grid.

    Parameters
    ----------
    vals1, vals2 : equal-length 1D sequences (lists, numpy arrays or
        dataframe columns) holding the samples of the two variables.
    x_range : pair (a, b) bounding the x grid. The density is evaluated
        along this grid for ``vals2``, so it is usually chosen wider than
        (np.min(vals2), np.max(vals2)).
    y_range : pair (c, d) bounding the y grid. The density is evaluated
        along this grid for ``vals1``, so it is usually chosen wider than
        (np.min(vals1), np.max(vals1)).
    N : number of grid points along each axis.

    Returns
    -------
    list ``[x, y, Z]`` where ``x`` and ``y`` are the 1D grids of length N
    and ``Z`` is an (N, N) array with
    ``Z[i, j] = density(vals1 = y[i], vals2 = x[j])``, i.e. rows follow
    ``y`` and columns follow ``x``.
    """
    # The intervals are unpacked here rather than in the signature: tuple
    # parameters are Python 2-only syntax and a SyntaxError on Python 3.
    a, b = x_range
    c, d = y_range

    x = np.linspace(a, b, N)
    y = np.linspace(c, d, N)
    X, Y = np.meshgrid(x, y)
    # Row order must match `values` below: the first row (y coordinates) is
    # paired with vals1, the second row (x coordinates) with vals2.
    positions = np.vstack([Y.ravel(), X.ravel()])

    values = np.vstack([vals1, vals2])
    kernel = st.gaussian_kde(values)
    Z = np.reshape(kernel(positions).T, X.shape)

    return [x, y, Z]


# ### Contour plot of the joint distribution of data from the first file ###

# In[28]:


import plotly.plotly as py
from plotly.graph_objs import *   


# In[29]:


def make_kdeplot(varX, varY, x_range, y_range, N, colorsc, title):
    """Build a Plotly contour figure of the estimated joint density.

    Parameters
    ----------
    varX, varY : lists, 1D numpy arrays or dataframe columns storing the
        values of the two variables; varX is drawn along the x axis and
        varY along the y axis.
    x_range : pair (a, b) used both as the x grid interval of the density
        estimate and as the x axis range of the plot.
    y_range : pair (c, d) used likewise for the y grid and the y axis.
    N : number of grid points along each axis.
    colorsc : colorscale passed to the contour trace.
    title : figure title.

    Returns
    -------
    A ``Figure`` holding one contour trace and a fixed-size 650 x 650 layout.
    """
    # The intervals are unpacked here rather than in the signature: tuple
    # parameters are Python 2-only syntax and a SyntaxError on Python 3.
    a, b = x_range
    c, d = y_range

    # kde_scipy evaluates its first variable along the y grid and its second
    # along the x grid, hence varY is passed before varX.
    x, y, Z = kde_scipy(varY, varX, (a, b), (c, d), N)

    data = Data([
        Contour(
            z=Z,
            x=x,
            y=y,
            colorscale=colorsc,
            opacity=0.9,
            contours=Contours(
                showlines=False)
        ),
    ])

    layout = Layout(
        title=title,
        font=Font(family='Georgia, serif', color='#635F5D'),
        showlegend=False,
        autosize=False,
        width=650,
        height=650,
        xaxis=XAxis(
            range=[a, b],
            showgrid=False,
            nticks=7
        ),
        yaxis=YAxis(
            range=[c, d],
            showgrid=False,
            nticks=7
        ),
        margin=Margin(
            l=40,
            r=40,
            b=85,
            t=100,
        ),
    )

    return Figure(data=data, layout=layout)


# In[30]:


# Grid resolution, and the interval shared by both axes for the first dataset.
N = 200
a, b = 5, 11
fig = make_kdeplot(
    dfc['multiannual'],
    dfc['bachelor-th'],
    (a, b),
    (a, b),
    N,
    cubehelix_cs,
    'kde plot of two sets of data',
)

# Sign in to the Plotly service, then send the figure for inline display.
py.sign_in('empet', 'my_api_key')
py.iplot(fig, filename='kde-2D-CSCE')


# ### Contour plot of the joint distribution of data from the second file ###

# In[31]:


# Interval shared by both axes for the second dataset; N is reused from above.
a, b = 4, 12
fig = make_kdeplot(
    dfi['multiannual'],
    dfi['bachelor-th'],
    (a, b),
    (a, b),
    N,
    cubehelix_cs,
    'kde plot of two sets of data',
)
py.iplot(fig, filename='kde-2D-SE')


# One notices that the second contourplot illustrates   a [mixture of two bivariate
# distributions](https://en.wikipedia.org/wiki/Mixture_distribution).

# Finally, we read a dataframe from a csv file posted on Plotly's GitHub account, select the rows corresponding to `Iris-virginica`, and plot the joint distribution of two virginica features:

# In[32]:


# Download the iris dataset and keep only the rows labelled "Iris-virginica".
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/iris.csv')
virginica = df[df['Name'] == "Iris-virginica"]

# x interval (a, b) for SepalLength, y interval (c, d) for SepalWidth.
a, b = 5, 8.5
c, d = 2, 4
N = 100
fig = make_kdeplot(
    virginica['SepalLength'],
    virginica['SepalWidth'],
    (a, b),
    (c, d),
    N,
    cubehelix_cs,
    'kde plot of joint distribution for virginica SepalLength and SepalWidth',
)
py.iplot(fig, filename='virginica-sepal-length-vs-width')


# In[33]:


from IPython.core.display import HTML
def  css_styling():
    """Read ./custom.css and return its contents wrapped in an HTML object."""
    # The with-block closes the file as soon as it has been read, instead of
    # leaving the handle open until it is garbage-collected.
    with open("./custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()