In [1]:
import IPython.core.display as di

# This line will hide code by default when the notebook is exported as HTML:
# the injected jQuery runs only when the page is NOT the live notebook app
# (no "body.notebook_app" element) and toggles off all input areas and prompts.
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
In [2]:
# <script>
#   jQuery(document).ready(function($) {

#   $(window).load(function(){
#     $('#preloader').fadeOut('slow',function(){$(this).remove();});
#   });

#   });
# </script>
In [3]:
# <style type="text/css">
#   div#preloader { position: fixed;
#       left: 0;
#       top: 0;
#       z-index: 999;
#       width: 100%;
#       height: 100%;
#       overflow: visible;
#       background: #fff url('http://preloaders.net/preloaders/720/Moving%20line.gif') no-repeat center center;
#   }

# </style>

# <div id="preloader"></div>
In [4]:
#hide_me
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import mpld3
# Route all matplotlib figure display through mpld3 so figures render as
# interactive D3/HTML (pan, zoom, tooltips) instead of static PNGs.
mpld3.enable_notebook()

Example Crude Data Analysis

by Dillon Brout

Features

As a quick demonstration, let's take a look at a simple scatter plot:

In most browsers you should be able to use your mouse or trackpad to pan and zoom around the figure, exploring the data.

Here we'll define a random dataset, insert it into a database, and add a set of labels to associate with our points. You can mouse over the points to see their entries in the database.

In [5]:
import pandas as pd

# Define some CSS to control our custom labels
css = """
table
{
  border-collapse: collapse;
}
th
{
  color: #ffffff;
  background-color: #d3d3d3;
}
td
{
  background-color: #f0f8ff;
}
table, th, td
{
  font-family:Verdana, Geneva, sans-serif;
  border: 1px solid black;
  text-align: right;
  padding: 3px;
}
"""

# 'axisbg' was removed from matplotlib (2.2); 'facecolor' is the supported keyword.
fig, ax = plt.subplots(subplot_kw=dict(facecolor='#EEEEEE'), figsize=(13, 5))
ax.grid(color='white', linestyle='solid')

# Build a random toy dataset: N points with x/y coordinates and a size column.
N = 20
df = pd.DataFrame(index=range(N))
df['x'] = np.random.randn(N)
df['y'] = np.random.randn(N)
df['Size'] = abs(np.random.randn(N))

# One small HTML table per point; shown by the tooltip plugin on hover.
labels = []
for i in range(N):
    # .ix was removed from pandas (1.0); .iloc is the positional equivalent.
    label = df.iloc[[i], :].T
    label.columns = ['Point Name {0}'.format(i)]
    labels.append(str(label.to_html()))


scatter = ax.scatter(df.x, df.y,
                     c=np.random.random(size=N),
                     s=1000 * df.Size,
                     alpha=.3, cmap=plt.cm.jet)

ax.set_xlabel('x')
ax.set_ylabel('y')

tooltip = mpld3.plugins.PointHTMLTooltip(scatter, labels=labels, voffset=10, hoffset=10, css=css)
mpld3.plugins.connect(fig, tooltip)

Public Dataset #1

I've gone ahead and applied this plotting technique to a CDC dataset on vehicle deaths.

Using pandas I was able to grab the dataset easily

import pandas as pd
query = ("https://data.cdc.gov/resource/6ce7-3zab.json")
df = pd.read_json(query).dropna(0)
print df[:5]
In [6]:
# Read the CDC vehicle-death dataset from a cached local copy; the commented
# URL records where the file originally came from.
#query = ("https://data.cdc.gov/resource/6ce7-3zab.json")
query = "6ce7-3zab.json"
df = pd.read_json(query).dropna(axis=0)

# Drop Socrata bookkeeping columns and the geocoded state fields we don't use.
# Positional axis arguments to drop() were removed in pandas 2.0; use columns=.
df = df.drop(columns=[':@computed_region_hjsp_umg2',
                      ':@computed_region_skr5_azej',
                      'state',
                      'state_state'])

df[:5]
Out[6]:
age_0_20 age_0_20_2014 age_21_34 age_21_34_2014 age_35_54 age_35_54_2014 age_55 age_55_2014 all_ages all_ages_2014 female female_2014 male male_2014 state_not_geocoded
1 2.6 3.4 7.9 7.5 4.3 6.4 7.3 7.0 5.1 6.0 4.2 7.6 5.9 6.0 Oregon
2 4.5 5.4 12.0 11.4 8.8 7.3 10.9 10.6 8.6 8.6 6.9 6.5 10.4 10.6 Idaho
4 2.1 2.6 5.3 6.3 4.2 3.6 4.3 5.1 3.9 4.3 2.5 2.9 5.2 5.7 Washington
7 3.6 3.7 9.6 10.2 7.6 6.1 8.0 6.9 6.9 6.5 4.7 4.8 9.1 8.2 Ohio
8 4.5 2.7 11.1 8.7 6.4 5.9 7.3 7.9 6.8 6.1 4.4 4.0 9.2 8.2 Pennsylvania

and we can uncover some differences in death rates amongst the states in the dataset.

In [7]:
# Death-rate-weighted mean age: weight each age band's rate by the band's
# approximate midpoint age (10, 28, 45, 68).
weage = (10.*df.age_0_20 + 28.*df.age_21_34 + 45.*df.age_35_54 + 68.*df.age_55)/(df.age_0_20 +df.age_21_34 +df.age_35_54 +df.age_55)

# Male share of the combined male+female death rate.
rat = df.male/(df.male+df.female)

# 'axisbg' was removed from matplotlib (2.2); 'facecolor' is the supported keyword.
fig, ax = plt.subplots(subplot_kw=dict(facecolor='#EEEEEE'),figsize=(13,5))
N = 100

# numeric_only=True: df still contains the string column 'state_not_geocoded',
# which modern pandas no longer silently drops from a row-wise sum.
scatter = ax.scatter(rat,
                     df.sum(axis=1, numeric_only=True),
                     s=1000 * (weage/np.max(weage))**10,
                     alpha=0.3,
                     cmap=plt.cm.jet)
ax.grid(color='white', linestyle='solid')
ax.set_xlabel('Percentage Male among Vehicle Deaths',size=20)
ax.set_ylabel('Vehicle Deaths per 100,000',size=20)
ax.yaxis.set_label_coords(-0.07, 0.5)
ax.xaxis.set_label_coords(.5, -.08)

# Hover labels: state name plus its weighted mean age.
labels = [str(i)+' Mean Age: '+str(round(wa,2)) for i,wa in zip(df.state_not_geocoded,weage)]
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)
#df

As you move your mouse around this plot, you should see the labels pop up as you hover over the points.

Toy Dataset

I'll create another toy dataset: we'll randomly draw a set of periods and amplitudes, and from these create sinusoidal time-series.

There are many ways in which this might be accomplished, but we'll take the approach of showing the desired line after hovering over the associated point.

In [8]:
#hide_me
from mpld3 import plugins, utils


class LinkedView(plugins.PluginBase):
    """mpld3 plugin linking a scatter plot to a single line plot.

    Hovering over a scatter point replaces the data of one line artist
    with that point's associated series, so the line view updates to
    show the curve belonging to the hovered point.
    """

    # Browser-side half of the plugin: registers with mpld3, looks up the
    # scatter ("idpts") and line ("idline") elements, and on mouseover swaps
    # the line's data for the hovered point's series, recoloring the line to
    # match the point's fill.
    JAVASCRIPT = """
    mpld3.register_plugin("linkedview", LinkedViewPlugin);
    LinkedViewPlugin.prototype = Object.create(mpld3.Plugin.prototype);
    LinkedViewPlugin.prototype.constructor = LinkedViewPlugin;
    LinkedViewPlugin.prototype.requiredProps = ["idpts", "idline", "data"];
    LinkedViewPlugin.prototype.defaultProps = {}
    function LinkedViewPlugin(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    LinkedViewPlugin.prototype.draw = function(){
      var pts = mpld3.get_element(this.props.idpts);
      var line = mpld3.get_element(this.props.idline);
      var data = this.props.data;

      function mouseover(d, i){
        line.data = data[i];
        line.elements().transition()
            .attr("d", line.datafunc(line.data))
            .style("stroke", this.style.fill);
      }
      pts.elements().on("mouseover", mouseover);
    };
    """

    def __init__(self, points, line, linedata):
        # points: the scatter (or Line2D) artist the user hovers over.
        # line: the single Line2D whose data gets swapped on hover.
        # linedata: one (x, y) series per point, JSON-serializable.
        # mpld3 gives Line2D artists an id with a "pts" suffix; path
        # collections (scatter) use the bare id.
        if isinstance(points, matplotlib.lines.Line2D):
            suffix = "pts"
        else:
            suffix = None

        # Props handed to the JavaScript side (see requiredProps above).
        self.dict_ = {"type": "linkedview",
                      "idpts": utils.get_id(points, suffix),
                      "idline": utils.get_id(line),
                      "data": linedata}

Instead of plotting all the lines here, we plot just a single line instance which the plugin will modify. The result is a much more compelling visualization of the data.

In [9]:
#hide_me
import matplotlib

# Two stacked panels: top will show one sine curve, bottom the
# (period, amplitude) points that select it.
fig, ax = plt.subplots(2, figsize=(13, 7))

# Draw random periods and amplitudes (seeded for reproducibility), then
# precompute one sine time-series per point.
np.random.seed(0)
P = 0.2 + np.random.random(size=20)
A = np.random.random(size=20)
x = np.linspace(0, 10, 100)
curves = []
for amp, period in zip(A, P):
    curves.append([x, amp * np.sin(x / period)])
data = np.array(curves)

# Bottom panel: one marker per (period, amplitude) pair.
points = ax[1].scatter(P, A, c=P + A,
                       s=200, alpha=0.5)
ax[1].set_xlabel('Period')
ax[1].set_ylabel('Amplitude')

# Top panel: a single placeholder line that the plugin rewrites on hover.
lines = ax[0].plot(x, 0 * x, '-w', lw=3, alpha=0.5)
ax[0].set_ylim(-1, 1)
ax[0].set_ylabel('Position')

ax[0].set_title("Hover over points to see lines", size=10)

# The plugin expects per-curve rows of (x, y) pairs, so swap the last two axes.
linedata = data.transpose(0, 2, 1).tolist()
plugins.connect(fig, LinkedView(points, lines[0], linedata))

Notice that as you hover over each point, the line in the top plot responds.

In [10]:
class VoronoiHighlightLines(plugins.PluginBase):
    """mpld3 plugin that highlights the nearest line on hover.

    Builds an invisible voronoi mesh over all line vertices so that the
    line closest to the cursor is raised to full opacity, and restored to
    its background opacity on mouseout.

    see also http://bl.ocks.org/mbostock/8033015
    """

    # Browser-side half: collects every vertex of every registered line,
    # builds a d3 voronoi tessellation over them (clipped to the axes), and
    # wires mouseover/mouseout on the voronoi cells to fade the owning line
    # between alpha_fg and alpha_bg.
    JAVASCRIPT = """
    mpld3.register_plugin("voronoi_highlightlines", VoronoiHighlightLines);
    VoronoiHighlightLines.prototype = Object.create(mpld3.Plugin.prototype);
    VoronoiHighlightLines.prototype.constructor = VoronoiHighlightLines;
    VoronoiHighlightLines.prototype.requiredProps = ["line_ids"];
    VoronoiHighlightLines.prototype.defaultProps = {alpha_bg:0.3, alpha_fg:1.0}
    function VoronoiHighlightLines(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    VoronoiHighlightLines.prototype.draw = function(){
      var alpha_fg = this.props.alpha_fg;
      var alpha_bg = this.props.alpha_bg;    
      
      // get the data for the voronoi mesh
      data = new Array();
      for(var i=0; i<this.props.line_ids.length; i++){
         var line_instance = mpld3.get_element(this.props.line_ids[i], this.fig);
         
         for (j=1; j<line_instance.data.length; j++){
             var obj = {}
             obj.x = line_instance.data[j][line_instance.props.xindex]
             obj.y = line_instance.data[j][line_instance.props.yindex]
             obj.line_instance = line_instance
             obj.line_id = this.props.line_ids[i]
             obj.label_id = i
             obj.fig = this.fig
             data.push(obj)
         }
      }

      var ax = mpld3.get_element(this.props.line_ids[0], this.fig).ax

      // we hide the transform from data coordinates to svg
      // coordinates in the voronoi
      var transform_x = function(d){return ax.x(d)+ax.position[0]};
      var transform_y = function(d){return ax.y(d)+ax.position[1]};
      
      var voronoi = d3.geom.voronoi()
                    .x(function(d) { return transform_x(d.x); })
                    .y(function(d) { return transform_y(d.y); })      
                    .clipExtent([ax.position, [ax.position[0]+ax.width, ax.position[1]+ax.height]]);

      

      var voronoiGroup =  this.fig.canvas.append("svg:g")
                  .attr("class", "voronoi");
      
      voronoiGroup.selectAll("path")
          .data(voronoi(d3.nest()
                      .key(function(d) { return d.x + "," + d.y; })
                      .rollup(function(v) { return v[0]; })
                      .entries(data)
                      .map(function(d) { return d.values; })))
        .enter().append("path")
          .attr("d", function(d) { 
                    var ret = "M" + d.join(" L") + "Z";
                    return ret; })
          .datum(function(d) {return d.point; })
          .on("mouseover", mouseover)
          .on("mouseout", mouseout);
    
    
      function mouseover(d) {
        d3.select(d.line_instance.path[0][0]).transition().duration(100)
            .style("stroke-opacity", alpha_fg);        
      }

      function mouseout(d) {
         d3.select(d.line_instance.path[0][0]).transition().duration(200)
                .style("stroke-opacity", alpha_bg);  
      }    
    };
    """

    def __init__(self, lines, css):
        # lines: list of Line2D artists to make hover-highlightable.
        # css: stylesheet for the voronoi mesh (may be falsy → no extra CSS).
        self.css_ = css or ""

        self.lines = lines
        # alpha_bg is taken from the first line's current alpha so mouseout
        # restores the lines to how they were originally drawn.
        self.dict_ = {"type": "voronoi_highlightlines",
                      "line_ids": [utils.get_id(line) for line in lines],
                      "alpha_bg": lines[0].get_alpha(),
                      "alpha_fg": 1.0}


# CSS for the voronoi mesh: cells are invisible (fill none, stroke-opacity 0)
# but still capture pointer events, which is what drives the hover detection.
# Set stroke-opacity above 0 to visualize the mesh for debugging.
css = """
.voronoi path 
{
  fill: none;
  pointer-events: all;
  stroke: red;
  stroke-opacity: .0;  
}
"""

Public Dataset #2

Here I have downloaded and parsed an example dataset of police logs using Pandas

Here are the first 10 entries in my database of noise complaints in the City of LA

import pandas as pd
import datetime

query = ("https://data.lacity.org/resource/mgue-vbsx.json?"
    "$group=date"
    "&call_type_code=507P"
    "&$select=date_trunc_ymd(dispatch_date)%20AS%20date%2C%20count(*)"
    "&$order=date")

raw_data = pd.read_json(query)
In [11]:
import datetime

# Socrata query that produced the cached file below: daily counts of
# call_type_code 507P (noise complaints), grouped and ordered by date.
query = ("https://data.lacity.org/resource/mgue-vbsx.json?"
    "$group=date"
    "&call_type_code=507P"
    "&$select=date_trunc_ymd(dispatch_date)%20AS%20date%2C%20count(*)"
    "&$order=date")
query = "mgue-vbsx.json"  # read the cached local copy instead of hitting the API
raw_data = pd.read_json(query)

# Vectorized .dt accessors replace the per-row Python loops: weekday number,
# and the Monday of each date's week as a 'YYYY-MM-DD' label.
raw_data['day_of_week'] = raw_data["date"].dt.dayofweek
raw_data['week'] = (raw_data["date"]
                    - pd.to_timedelta(raw_data["date"].dt.dayofweek, unit="D")).dt.strftime("%Y-%m-%d")

# Pivot our data to get the matrix we need: rows = weeks, columns = weekdays.
data = raw_data.pivot(index='week', columns='day_of_week', values='count')
data = data.fillna(value=0)

# Get our "weeks" and "days"
weeks = list(data.index)
days = ["Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun"]
raw_data[0:10]
Out[11]:
count date day_of_week week
0 285 2014-01-01 2 2013-12-30
1 16 2014-01-02 3 2013-12-30
2 33 2014-01-03 4 2013-12-30
3 134 2014-01-04 5 2013-12-30
4 101 2014-01-05 6 2013-12-30
5 11 2014-01-06 0 2014-01-06
6 17 2014-01-07 1 2014-01-06
7 12 2014-01-08 2 2014-01-06
8 10 2014-01-09 3 2014-01-06
9 55 2014-01-10 4 2014-01-06

We can now plot noise complaints as a function of the day of the week and as a function of month of the year interactively!

In [12]:
import scipy.interpolate  # unused here; kept so the cell's environment is unchanged
weeknum = []
# Aggregate the weekly counts into a 12 (month) x 7 (weekday) matrix.
bigarr = np.zeros((12,7))
for idw,w in enumerate(weeks):
    weeknum.append(idw)
    month = int(w.split('-')[1]) - 1  # week label is 'YYYY-MM-DD'
    for idx, day in enumerate(days):
        # .iloc makes the positional column lookup explicit and avoids the
        # label/position ambiguity of chained data.loc[w][idx].
        bigarr[month,idx] += int(data.loc[w].iloc[idx])

fig, ax = plt.subplots(figsize=(13,8))
# set_color_cycle was removed in matplotlib 2.0+; set_prop_cycle replaces it.
ax.set_prop_cycle(color=['red', 'blue', 'gold','orange','magenta','green','pink','grey','turquoise','black','lime','dodgerblue'])
lines = ax.plot(np.arange(7), bigarr.T, lw=6, alpha=0.1)
ax.set_xlabel('Day of Week',size=20)
ax.set_ylabel('Number of Noise Complaints',size=20)

ax.yaxis.set_label_coords(-0.07, 0.5)
ax.xaxis.set_label_coords(.5, -.08)


# One legend entry per month; legend lines drawn fully opaque for readability.
leg = ax.legend(iter(lines), ('Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'),loc=0)
for l in leg.get_lines():
    l.set_alpha(1)
leg.set_title("")
plugins.clear(fig)
plugins.connect(fig, VoronoiHighlightLines(lines, css))

Bayesian Model Fitting

Here I will generate a synthetic dataset with an underlying model and present datapoints with underestimated uncertainties and demonstrate our ability to account for this using Bayesian statistical analysis.

import numpy as np

# Choose the "true" parameters.
m_true = -0.9594
b_true = 4.294
f_true = 0.534
# Generate some synthetic data from the model.
N = 50
x = np.sort(10*np.random.rand(N))
yerr = 0.1+0.5*np.random.rand(N)
y = m_true*x+b_true
y += np.abs(f_true*y) * np.random.randn(N)
y += yerr * np.random.randn(N)
In [13]:
import emcee
import corner
import numpy as np
import scipy.optimize as op
from matplotlib.ticker import MaxNLocator

# Fix the seed so the synthetic draw (and everything downstream) is reproducible.
np.random.seed(123)

# "True" parameters of the generative model y = m*x + b with fractional scatter f.
m_true = -0.9594
b_true = 4.294
f_true = 0.534

# Draw N points on [0, 10], give each a reported (underestimated) error bar,
# then add both the model-dependent scatter and the reported noise.
# NOTE: the order of these random draws must not change — it fixes the dataset.
N = 50
x = np.sort(10 * np.random.rand(N))
yerr = 0.1 + 0.5 * np.random.rand(N)
y = m_true * x + b_true
y += np.abs(f_true * y) * np.random.randn(N)
y += yerr * np.random.randn(N)


# Overplot the data with error bars and the underlying true line.
xl = np.array([0, 10])
plt.figure(figsize=(13, 9))
plt.errorbar(x, y, yerr=yerr, fmt=".k")
plt.plot(xl, m_true * xl + b_true, c="orange", lw=3, alpha=0.6, label='True Model')
plt.ylim(-9, 9)
plt.xlabel("x", size=20)
plt.ylabel("y", size=20)
plt.legend(title='')
plt.title('Sim Data Points with True Model', size=25)
plt.tight_layout()

If we then apply a typical (non-bayesian) least squares fit we will obtain an incorrect fit of our dataset.

In [14]:
# Generalized least squares for the straight-line model (ignoring the f term):
# solve (A^T C^-1 A) [b, m]^T = A^T C^-1 y, where C = diag(yerr^2).
design = np.vstack((np.ones_like(x), x)).T
noise_cov = np.diag(yerr * yerr)
cov = np.linalg.inv(np.dot(design.T, np.linalg.solve(noise_cov, design)))
b_ls, m_ls = np.dot(cov, np.dot(design.T, np.linalg.solve(noise_cov, y)))
In [15]:
# Compare the true model against the (biased) least-squares fit.
fig, ax = plt.subplots(1,1,figsize=(13, 9))
xl = np.array([0, 10])   
ax.plot(xl, m_true*xl+b_true, color="orange", lw=2, alpha=0.8,label='True Model')
ax.plot(xl, m_ls*xl+b_ls, color='g', lw=3, alpha=0.6,label='Least Squares Fit Model')
ax.errorbar(x, y, yerr=yerr, fmt=".k")
ax.legend(title='')
ax.set_xlabel('x',size=20)
ax.set_ylabel('y',size=20)
ax.set_title('Model Comparison',size=25)
fig.tight_layout()
# `print ''` is Python-2-only syntax and breaks under Python 3; the
# parenthesized form works in both and still suppresses the cell's repr output.
print('')

Maximum likelihood estimation

If we include an estimation of the uncertainty in our model then we can avoid the problem encountered by least squares. We need to write down the likelihood function and numerically optimize it. In mathematical notation, the correct likelihood function is:

$$\ln\,p(y\,|\,x,\sigma,m,b,f) = -\frac{1}{2} \sum_n \left[ \frac{(y_n-m\,x_n-b)^2}{s_n^2} + \ln \left( 2\pi\,s_n^2 \right) \right]$$

where

$$s_n^2 = \sigma_n^2+f^2\,(m\,x_n+b)^2 \quad $$

This likelihood function is simply a Gaussian where the variance is underestimated by some fractional amount: f. In Python, you would code this up as:

def lnlike(theta, x, y, yerr):
    m, b, lnf = theta
    model = m * x + b
    inv_sigma2 = 1.0/(yerr**2 + model**2*np.exp(2*lnf))
    return -0.5*(np.sum((y-model)**2*inv_sigma2 - np.log(inv_sigma2)))

Bayesian Uncertainty Estimation

Bayes formula tells us the posterior probability

$$p (m,b,f\,|\,x,y,\sigma) \propto p(m,b,f)\,p(y\,|\,x,\sigma,m,b,f) \quad $$

We have already, in the previous section, written down the likelihood function

$$p(y\,|\,x,\sigma,m,b,f)$$

so the missing component is the “prior” function

$$p(m,b,f) \quad $$

Which we will take to be a "flat prior"

def lnprior(theta):
    return 0.0

Such that we can proceed with our sampling of the model parameters and uncertainty estimation using lnprob()

def lnprob(theta, x, y, yerr):
    lp = lnprior(theta)
    if not np.isfinite(lp):
        return -np.inf
    return lp + lnlike(theta, x, y, yerr)
In [16]:
# Define the probability function as likelihood * prior.
def lnprior(theta):
    """Log of a flat (uniform) prior: 0.0 inside the allowed box, -inf outside."""
    m, b, lnf = theta
    inside = (-5.0 < m < 0.5) and (0.0 < b < 10.0) and (-10.0 < lnf < 1.0)
    return 0.0 if inside else -np.inf

def lnlike(theta, x, y, yerr):
    """Gaussian log-likelihood whose variance is inflated by f = exp(lnf).

    Each point's variance is s_n^2 = yerr_n^2 + f^2 * model_n^2, so the
    model can absorb underestimated error bars.
    """
    m, b, lnf = theta
    prediction = m * x + b
    variance = yerr ** 2 + prediction ** 2 * np.exp(2 * lnf)
    return -0.5 * np.sum((y - prediction) ** 2 / variance + np.log(variance))

def lnprob(theta, x, y, yerr):
    """Log-posterior: flat prior plus likelihood; -inf outside the prior support."""
    prior = lnprior(theta)
    if np.isfinite(prior):
        return prior + lnlike(theta, x, y, yerr)
    return -np.inf

# Find the maximum-likelihood parameters by minimizing -2*ln L,
# starting the optimizer from the true parameter values.
def chi2(*args):
    return -2 * lnlike(*args)

result = op.minimize(chi2, [m_true, b_true, np.log(f_true)], args=(x, y, yerr))
m_ml, b_ml, lnf_ml = result["x"]

Here I run a Markov Chain Monte Carlo in order to estimate our parameters.

#...
sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))
#...
In [17]:
# Set up the sampler: 50 walkers exploring the 3-dimensional (m, b, lnf)
# space, each initialized in a small Gaussian ball around the
# maximum-likelihood point found above.
ndim, nwalkers = 3, 50
pos = []
for _ in range(nwalkers):
    pos.append(result["x"] + 1e-2 * np.random.randn(ndim))
sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=(x, y, yerr))

# Run the production chain: 500 steps (no separate burn-in here).
a = sampler.run_mcmc(pos, 500, rstate0=np.random.get_state())
#print("Done.")

Below is the output of our sampling algorithm for our 3 parameter fit.

In [18]:
fig, axes = plt.subplots(3, 1, sharex=True, figsize=(13, 6))
axes[0].plot(sampler.chain[:, :, 0].T, color="k", alpha=0.4)
axes[0].yaxis.set_major_locator(MaxNLocator(5))
axes[0].axhline(m_true, color="#888888", lw=2)
axes[0].set_ylabel("m",size=20)

axes[1].plot(sampler.chain[:, :, 1].T, color="k", alpha=0.4)
axes[1].yaxis.set_major_locator(MaxNLocator(5))
axes[1].axhline(b_true, color="#888888", lw=2)
axes[1].set_ylabel("b",size=20)

axes[2].plot(np.exp(sampler.chain[:, :, 2]).T, color="k", alpha=0.4)
axes[2].yaxis.set_major_locator(MaxNLocator(5))
#axes[2].axhline(f_true, color="#888888", lw=2)
axes[2].set_ylabel("f",size=20)
axes[2].set_xlabel("step number",size=20)

axes[0].yaxis.set_label_coords(-0.1, 0.5)
axes[1].yaxis.set_label_coords(-0.1, 0.5)
axes[2].yaxis.set_label_coords(-0.1, 0.5)
axes[2].xaxis.set_label_coords(.5, -.1)

fig.tight_layout(h_pad=0.0)
# plt.clf()