#!/usr/bin/env python # coding: utf-8 # ## 2D kernel density estimation and its Plotly plot ## # We have two `Excel` files with two columns. We read the files into two `pandas` dataframes and plot # for each of them an estimate of the joint distribution of the corresponding two columns. The joint distribution is calcalutated by `scipy.stats.gaussian_kde` [function](http://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.gaussian_kde.html). # In[23]: import numpy as np import pandas as pd import seaborn as sns import numpy as np import scipy.stats as st import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') # Read the first file: # In[24]: xl = pd.ExcelFile("Data/CSCEng.xls") dfc = xl.parse("Sheet1") dfc.columns # and the seconed one: # In[25]: xl = pd.ExcelFile("Data/SystEng.xls") dfi = xl.parse("Sheet1") dfi.columns # The contour plot of the joint distribution of two variables (columns) is colored with a custom colorscale: # In[26]: cubehelix_cs=[[0.0, '#fcf9f7'], [0.16666666666666666, '#edcfc9'], [0.3333333333333333, '#daa2ac'], [0.5, '#bc7897'], [0.6666666666666666, '#925684'], [0.8333333333333333, '#5f3868'], [1.0, '#2d1e3e']] # The function `kde_scipy` returns data for Plotly contour plot of the estimated 2D distribution: # In[27]: def kde_scipy( vals1, vals2, (a,b), (c,d), N ): #vals1, vals2 are the values of two variables (columns) #(a,b) interval for vals1; usually larger than (np.min(vals1), np.max(vals1)) #(c,d) -"- vals2 x=np.linspace(a,b,N) y=np.linspace(c,d,N) X,Y=np.meshgrid(x,y) positions = np.vstack([Y.ravel(), X.ravel()]) values = np.vstack([vals1, vals2]) kernel = st.gaussian_kde(values) Z = np.reshape(kernel(positions).T, X.shape) return [x, y, Z] # ### Contour plot of the joint distribution of data from the first file ### # In[28]: import plotly.plotly as py from plotly.graph_objs import * # In[29]: def make_kdeplot(varX, varY, (a,b), (c,d), N, colorsc, title): #varX, varY are lists, 1d numpy.array(s), or dataframe columns, storing the values of two variables x, y, Z = kde_scipy(varY, varX, (a,b), (c,d), N ) data = Data([ Contour( z=Z, x=x, y=y, colorscale=colorsc, #reversescale=True, opacity=0.9, contours=Contours( showlines=False) ), ]) layout = Layout( title= title, font= Font(family='Georgia, serif', color='#635F5D'), showlegend=False, autosize=False, width=650, height=650, xaxis=XAxis( range=[a,b], showgrid=False, nticks=7 ), yaxis=YAxis( range=[c,d], showgrid=False, nticks=7 ), margin=Margin( l=40, r=40, b=85, t=100, ), ) return Figure( data=data, layout=layout ) # In[30]: N=200 a,b=(5,11) fig=make_kdeplot(dfc['multiannual'], dfc['bachelor-th'], (a,b), (a,b), N, cubehelix_cs,'kde plot of two sets of data' ) py.sign_in('empet', 'my_api_key') py.iplot(fig, filename='kde-2D-CSCE') # ### Contour plot of the joint distribution of data from the second file ### # In[31]: a, b=(4,12) fig=make_kdeplot(dfi['multiannual'], dfi['bachelor-th'], (a,b), (a,b), N, cubehelix_cs, 'kde plot of two sets of data') py.iplot(fig, filename='kde-2D-SE') # One notices that the second contourplot illustrates a [mixture of two bivariate # distributions](https://en.wikipedia.org/wiki/Mixture_distribution). # Finally we read a dataframe from a csv file posted on the Plotly's github account, select the rows corresponding to `Iris-virginica`, and plot the joint distribution of two virginica features: # In[32]: df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/iris.csv') virginica = df.loc[df.Name == "Iris-virginica"] a, b=(5,8.5) c,d=(2,4) N=100 fig=make_kdeplot(virginica.SepalLength, virginica.SepalWidth, (a,b), (c,d), N, cubehelix_cs, 'kde plot of joint distribution for virginica SepalLength and SepalWidth') py.iplot(fig, filename='virginica-sepal-length-vs-width') # In[33]: from IPython.core.display import HTML def css_styling(): styles = open("./custom.css", "r").read() return HTML(styles) css_styling()