Kernel Density Estimation. Plotly plot of the joint pdf and marginal pdf estimations

In [1]:
import pandas as pd
import numpy as np


Read data from an Excel file:

In [2]:
# Open the workbook (path is relative to the notebook's working directory)
# and parse the worksheet named "Sheet1" into the DataFrame dfc.
xl = pd.ExcelFile("CSCEng.xls")
dfc = xl.parse("Sheet1")
# Show the column labels of the parsed sheet.
dfc.columns

Out[2]:
Index([u'multiannual', u'bachelor-th'], dtype='object')
In [3]:
# Preview the first rows of the two columns.
dfc.head()

Out[3]:
multiannual bachelor-th
0 8.01 7.95
1 8.63 8.63
2 7.03 8.37
3 8.53 8.05
4 8.41 9.53

We estimate the joint pdf of the two columns, dfc['multiannual'] and dfc['bachelor-th'], using a Gaussian kernel:

In [4]:
import scipy.stats as st
def kde_scipy(vals1, vals2, ab, cd, N):
    """Estimate a 2-D Gaussian-kernel density and evaluate it on an N x N grid.

    Parameters
    ----------
    vals1, vals2 : 1-D sequences of equal length
        Paired observations of the two variables.
    ab : pair (a, b)
        End points of the grid returned as ``x``.
    cd : pair (c, d)
        End points of the grid returned as ``y``.
    N : int
        Number of grid points along each axis.

    Returns
    -------
    list ``[x, y, Z]``
        ``x`` and ``y`` are the 1-D grids and ``Z`` is an N x N array, ready
        to be passed to Plotly for plotting the contour of the joint pdf.

    Notes
    -----
    The evaluation points are stacked as (Y, X) while the data are stacked as
    (vals1, vals2).  Consequently ``vals1`` is paired with the ``y`` grid and
    ``vals2`` with the ``x`` grid: ``Z[i][j]`` is the estimated density at
    ``vals1 = y[i]``, ``vals2 = x[j]``.  Pass the variable meant for the
    vertical axis as ``vals1``.
    """
    # The intervals are received as ordinary arguments and unpacked here:
    # tuple parameters in the signature are a syntax error on Python 3.
    a, b = ab
    c, d = cd

    x = np.linspace(a, b, N)
    y = np.linspace(c, d, N)
    X, Y = np.meshgrid(x, y)

    # ravel() concatenates the rows of each mesh array; row 0 of `positions`
    # lines up with vals1 and row 1 with vals2.
    positions = np.vstack([Y.ravel(), X.ravel()])

    values = np.vstack([vals1, vals2])
    kernel = st.gaussian_kde(values)
    Z = np.reshape(kernel(positions).T, X.shape)

    return [x, y, Z]

In [5]:
# Grid settings: the joint pdf is evaluated at the N x N grid points
# of the square [a, b] x [a, b].
a, b = 5, 11
N = 200

In [6]:
# Plain lists of the two columns: x <- 'multiannual', y <- 'bachelor-th'.
x, y = (list(dfc[column]) for column in ('multiannual', 'bachelor-th'))

In [7]:
# One-dimensional Gaussian KDEs, one per variable, give the marginal pdfs.
pdfx, pdfy = st.gaussian_kde(x), st.gaussian_kde(y)

# Marginal of the x-variable: X is the evaluation grid on [a, b],
# Y holds the values of pdfx on that grid.
X = np.linspace(a, b, num=100)
Y = pdfx(X)

# Marginal of the y-variable: pdfy is a function of y, so yy is the grid
# and xx holds the density values.
yy = np.linspace(a, b, num=100)
xx = pdfy(yy)

In [8]:
# kde_scipy stacks its evaluation points as (Y, X), i.e. its first data
# argument is paired with the y grid and its second with the x grid.
# Passing 'bachelor-th' first therefore makes Zvals[i][j] the density at
# multiannual = Xvals[j], bachelor-th = Yvals[i].
Xvals, Yvals, Zvals = kde_scipy( dfc['bachelor-th'],dfc['multiannual'], (a,b), (a,b), N )
#attn: here we reversed the columns order


Define Data and Layout for Plotly plot:

In [9]:
import plotly.plotly as py
from plotly.graph_objs import *


Set the text to be displayed when hovering the mouse over the contour plot of the joint pdf:

In [22]:
# Hover labels for the joint-pdf contour.  hover_xy[i][j] describes the grid
# point (Xvals[j], Yvals[i]) and follows the row/column layout of Zvals.
# Each label reads 'f(x, y)= z' with balanced parentheses.
hover_xy = [
    ['f({:0.2f}, {:0.2f})= {:0.2f}'.format(Xvals[j], Yvals[i], Zvals[i][j])
     for j in range(len(Xvals))]
    for i in range(len(Yvals))
]

In [11]:
# Spot-check a single hover label (row index 62, column index 57).
hover_xy[62][57]

Out[11]:
'f(6.72, 6.87)= 0.05)'

Plotly version of the matplotlib cmocean.salinity colormap:

In [12]:
# Plotly colorscale: a list of [position, color] pairs, with positions
# running from 0.0 to 1.0 in steps of 0.05 (21 stops in total).
pl_salinity = [
    [0.0,  'rgb(41,24,107)'],
    [0.05, 'rgb(45,27,137)'],
    [0.1,  'rgb(40,39,162)'],
    [0.15, 'rgb(24,61,158)'],
    [0.2,  'rgb(12,77,150)'],
    [0.25, 'rgb(15,91,144)'],
    [0.3,  'rgb(24,102,140)'],
    [0.35, 'rgb(35,113,138)'],
    [0.4,  'rgb(44,124,136)'],
    [0.45, 'rgb(52,135,136)'],
    [0.5,  'rgb(59,147,135)'],
    [0.55, 'rgb(66,158,132)'],
    [0.6,  'rgb(74,169,128)'],
    [0.65, 'rgb(85,181,122)'],
    [0.7,  'rgb(100,193,113)'],
    [0.75, 'rgb(122,203,102)'],
    [0.8,  'rgb(148,211,93)'],
    [0.85, 'rgb(179,217,94)'],
    [0.9,  'rgb(208,224,109)'],
    [0.95, 'rgb(232,231,131)'],
    [1.0,  'rgb(253,238,153)'],
]


Define a Contour object:

In [13]:
# Contour trace of the joint pdf on the (Xvals, Yvals) grid.
# Hovering shows only the precomputed labels in hover_xy; the colorbar and
# the contour lines are switched off.
trace1 = Contour(
    x=Xvals,
    y=Yvals,
    z=Zvals,
    text=hover_xy,
    hoverinfo='text',
    colorscale=pl_salinity,
    showscale=False,
    contours=Contours(showlines=False),
)


Set hover text for the two marginal pdfs:

In [14]:
# Hover labels for the marginal curves, one label per grid point.
# textx pairs each X grid value with its density Y = g(x);
# texty pairs each yy grid value with its density xx = h(y).
textx = ['(x,g(x))=(' + '{:0.2f}'.format(grid_pt) + ', ' + '{:0.2f}'.format(density) + ')'
         for grid_pt, density in zip(X, Y)]
texty = ['(y,h(y))=(' + '{:0.2f}'.format(grid_pt) + ', ' + '{:0.2f}'.format(density) + ')'
         for grid_pt, density in zip(yy, xx)]

In [23]:
# Marginal pdf g(x): drawn against the shared x axis ('x1') and the
# secondary y axis ('y2'), filled down to y = 0.
trace2 = Scatter(
    name='pdf-x',
    x=X,
    y=Y,
    xaxis='x1',
    yaxis='y2',
    mode='lines',
    line=Line(width=2, color='rgb(66,158,132)', shape='spline'),
    fill='tozeroy',
    fillcolor='rgb(122,203,102)',
    text=textx,
    hoverinfo='text',
)

# Marginal pdf h(y): density values on the secondary x axis ('x2'), grid
# values on the shared y axis ('y1'), filled across to x = 0.
trace3 = Scatter(
    name='pdf-y',
    x=xx,
    y=yy,
    xaxis='x2',
    yaxis='y1',
    mode='lines',
    line=Line(width=2, color='rgb(66,158,132)', shape='spline'),
    fill='tozerox',
    fillcolor='rgb(122,203,102)',
    text=texty,
    hoverinfo='text',
)

In [24]:
# Collect the joint-pdf contour and the two marginal-pdf traces for the figure.
data = Data([trace1, trace2, trace3])


Set the plot layout:

In [26]:
# Figure layout.  The main panel (xaxis / yaxis) occupies [0, 0.8] of the
# plot area in each direction and is limited to [a, b]; the secondary axes
# (xaxis2 / yaxis2) occupy the strip [0.82, 1] used by the marginal traces.
layout = Layout(
    title='Kernel Density Estimation',
    font=Font(size=11),
    autosize=False,
    width=650,
    height=550,
    margin=Margin(t=50),
    showlegend=False,
    hovermode='closest',
    # Main panel, horizontal axis.
    xaxis=XAxis(
        title='x',
        titlefont=Font(size=11),
        domain=[0, 0.8],
        range=[a, b],
        tickvals=[6, 7, 8, 9, 10, 11],
        showgrid=False,
        zeroline=False,
    ),
    # Main panel, vertical axis.
    yaxis=YAxis(
        title='y',
        titlefont=Font(size=11),
        domain=[0, 0.8],
        range=[a, b],
        showgrid=False,
        zeroline=False,
    ),
    # Secondary horizontal axis (referenced as 'x2' by trace3), ticks on top.
    xaxis2=XAxis(
        domain=[0.82, 1],
        side='top',
        ticklen=4,
        showgrid=False,
        zeroline=False,
    ),
    # Secondary vertical axis (referenced as 'y2' by trace2).
    yaxis2=YAxis(
        domain=[0.82, 1],
        ticklen=4,
        showgrid=False,
        zeroline=False,
    ),
)

# Assemble the figure from the traces and the layout.
fig = Figure(data=data, layout=layout)

In [27]:
# Switch plotly to offline mode for this notebook session
# (presumably so figures render inline without the online service -- confirm
# against the plotly.offline documentation for the installed version).
import plotly
plotly.offline.init_notebook_mode()