#!/usr/bin/env python
# coding: utf-8

# # Creating Interactive Visualizations with Bokeh

# [Bokeh](http://bokeh.pydata.org) is a Python package for creating interactive, browser-based visualizations, and is well-suited for "big data" applications.
#
# * Bindings can be (and have been) created for other languages.
#
# Bokeh allows users to create interactive HTML visualizations without using JS.
#
# Bokeh is a **language-based** visualization system. This allows for:
#
# * high-level commands for data binding, transformation, interaction
# * low-level power to deeply customize
#
# Bokeh philosophy:
#
# > Make a smart choice when it is possible to do so automatically, and expose low-level capabilities when it is not.
#
# How does Bokeh work?
#
# Bokeh writes to a custom-built HTML5 Canvas library, which affords it high performance. This allows it to integrate with other web tools, such as Google Maps.
#
# Bokeh plots are based on visual elements called **glyphs** that are bound to data objects.

# ## Installation
#
# Bokeh can be installed easily either via `pip` or `conda` (if using Anaconda):
#
#     pip install bokeh
#
#     conda install bokeh

# ## A Simple Example
#
# First we'll import the bokeh.plotting module, which defines the graphical functions and primitives.

# In[1]:

import bokeh.plotting as bk

# Next, we'll tell Bokeh to display its plots directly into the notebook. This will cause all of the Javascript and data to be embedded directly into the HTML of the notebook itself. (Bokeh can output straight to HTML files, or use a server, which we'll look at later.)

# In[2]:

bk.output_notebook()

# Next, we'll import NumPy and create some simple data.

# In[3]:

import numpy as np

# 100 evenly spaced abscissae; each ordinate is a normal draw centred on 0.3*x
# with unit standard deviation.
x = np.linspace(-6, 6, num=100)
y = np.random.normal(loc=0.3 * x, scale=1)

# Now we'll call Bokeh's `circle()` function to render a red circle at
# each of the points in x and y.
#
# We can immediately interact with the plot:
#
# * click-and-drag will pan the plot around.
# * Shift + mousewheel will zoom in and out

# In[4]:

square_side = 500
fig = bk.figure(plot_width=square_side, plot_height=square_side)
fig.circle(x, y, color="red")
bk.show(fig)

# # Statistical Plots
#
# Let's try plotting multiple series on the same axes.
#
# First, we generate some data from an exponential distribution with mean $\theta=1$.

# In[5]:

from scipy.stats import expon

theta = 1
measured = np.random.exponential(theta, 1000)
# density=True normalises the bar heights so they are comparable with the PDF
hist, edges = np.histogram(measured, density=True, bins=50)

# Next, create our figure, which is not displayed until we ask Bokeh to do so explicitly. We will customize the interactive toolbar, as well as customize the background color.

# In[6]:

fig = bk.figure(
    title="Exponential Distribution (θ=1)",
    tools="previewsave",
    background_fill="#E8DDCB",
)

# The quad glyph displays axis-aligned rectangles with the given attributes.

# In[7]:

# Consecutive bin edges give the left and right side of every histogram bar
bar_left, bar_right = edges[:-1], edges[1:]
fig.quad(
    top=hist,
    bottom=0,
    left=bar_left,
    right=bar_right,
    fill_color="#036564",
    line_color="#033649",
)

# Next, add lines showing the form of the probability distribution function (PDF) and cumulative distribution function (CDF).

# In[8]:

x = np.linspace(0, 10, 1000)
pdf_curve = expon.pdf(x, scale=theta)
cdf_curve = expon.cdf(x, scale=theta)
fig.line(x, pdf_curve, line_color="#D95B43", line_width=8, alpha=0.7, legend="PDF")
fig.line(x, cdf_curve, line_color="white", line_width=2, alpha=0.7, legend="CDF")

# Finally, position the legend and display the complete plot.

# In[9]:

fig.legend.location = "top_right"
bk.show(fig)

# ## Bar Plot Example
#
# Bokeh's core display model relies on *composing graphical primitives* which are bound to data series. A more sophisticated example demonstrates this idea.
#
# Bokeh ships with a small set of interesting "sample data" in the `bokeh.sampledata` package. We'll load up some historical automobile fuel efficiency data, which is returned as a [Pandas](http://pandas.pydata.org) `DataFrame`.
# In[10]:

from bokeh.sampledata.autompg import autompg

# We first need to reshape the data: the per-year statistics come from grouping by the year of the car, and the cars are separately split by country of origin (here, USA or Japan).

# In[11]:

grouped = autompg.groupby("yr")
mpg = grouped["mpg"]
mpg_avg = mpg.mean()
mpg_std = mpg.std()
# Take the years from the index of the aggregated result so that every year is
# guaranteed to line up with its mean/std; the key order of `grouped.groups`
# (a plain mapping) is not tied to the order of the aggregates.
years = np.asarray(mpg_avg.index)
american = autompg[autompg["origin"]==1]
japanese = autompg[autompg["origin"]==3]
# Bare expression: only produces output when run as a notebook cell
american.head(10)

# ![Fury](https://c1.staticflickr.com/5/4022/4312399177_4f39f17a4b_z.jpg?zz=1)
#
# For each year, we want to plot the distribution of MPG within that year. As a guide, we will include a box that represents the mean efficiency, plus and minus one standard deviation. We will make these boxes partly transparent.

# In[12]:

fig = bk.figure(title='Automobile mileage by year and country')
fig.quad(
    left=years-0.4,
    right=years+0.4,
    bottom=mpg_avg-mpg_std,
    top=mpg_avg+mpg_std,
    fill_alpha=0.4,
)

# Next, we overplot the actual data points, using contrasting symbols for American and Japanese cars.

# In[13]:

# Hollow, semi-transparent markers so that overlapping cars stay visible
marker_style = dict(size=8, alpha=0.4, fill_color=None, line_width=2)

# Add Japanese cars as circles
fig.circle(x=np.asarray(japanese["yr"]), y=np.asarray(japanese["mpg"]),
           line_color="red", **marker_style)
# Add American cars as triangles
fig.triangle(x=np.asarray(american["yr"]), y=np.asarray(american["mpg"]),
             line_color="blue", **marker_style)

# We can add axis labels by binding them to the `axis_label` attribute of each axis.

# In[14]:

fig.xaxis.axis_label = 'Year'
fig.yaxis.axis_label = 'MPG'

# In[15]:

bk.show(fig)

# ## Linked Brushing
#
# To link plots together at a data level, we can explicitly wrap the data in a ColumnDataSource. This allows us to reference columns by name.

# In[16]:

variables = autompg.to_dict("list")
# NOTE(review): this replaces the plain-list 'yr' column with the original
# Series; the reason is not visible here -- confirm before simplifying.
variables.update({'yr':autompg["yr"]})
source = bk.ColumnDataSource(variables)

# The `gridplot` function takes a 2-dimensional list containing elements to be arranged in a grid on the same canvas.
# In[17]:

# Options shared by the three linked panels
plot_config = {
    "plot_width": 300,
    "plot_height": 300,
    "tools": "box_select,lasso_select,help",
}

# All three panels draw from the same `source`, which is what links selections
left = bk.figure(title="MPG by Year", **plot_config)
left.circle("yr", "mpg", color="blue", source=source)

center = bk.figure(title="HP vs. Displacement", **plot_config)
center.circle("hp", "displ", color="green", source=source)

right = bk.figure(title="MPG vs. Displacement", **plot_config)
right.circle("mpg", "displ", size="cyl", line_color="red", source=source)

# We can use the `select` tool to select points on one plot, and the linked points on the other plots will *automagically* highlight.

# In[18]:

panel_grid = [[left, center, right]]
p = bk.gridplot(panel_grid)
bk.show(p)

# ## Visualization of US unemployment rates
#
# Our first example of an interactive chart involves generating a heat map of US unemployment rates by month and year. This plot will be made interactive by invoking a `HoverTool` that displays information as the pointer hovers over any cell within the plot.

# First, we import the data with Pandas and manipulate it as needed.

# In[20]:

import pandas as pd
from bokeh.models import HoverTool
from bokeh.sampledata.unemployment1948 import data
from collections import OrderedDict

# Years become strings so that they can serve as categorical axis labels
data['Year'] = data['Year'].map(str)
years = data['Year'].tolist()
months = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
data = data.set_index('Year')

# Specify a color map (where do we get color maps, you ask? -- Try [Color Brewer](http://colorbrewer2.org))

# In[21]:

colors = [
    "#75968f",
    "#a5bab7",
    "#c9d9d3",
    "#e2e2e2",
    "#dfccce",
    "#ddb7b1",
    "#cc7878",
    "#933b41",
    "#550b1d",
]

# Set up the data for plotting. We will need to have values for every pair of year/month names. Map the rate to a color.
# In[22]:

month = []
year = []
color = []
rate = []
last_color = len(colors) - 1
for yr in years:
    for mon in months:
        month.append(mon)
        year.append(yr)
        monthly_rate = data[mon][yr]
        rate.append(monthly_rate)
        # Palette slot 0 corresponds to a rate of 2-3%. Clamp at both ends:
        # without the lower bound a rate below 2% yields a negative index,
        # which silently wraps around to the darkest colors.
        color_idx = max(0, min(int(monthly_rate) - 2, last_color))
        color.append(colors[color_idx])

# Create a `ColumnDataSource` with columns: month, year, color, rate

# In[23]:

source = bk.ColumnDataSource(
    data=dict(
        month=month,
        year=year,
        color=color,
        rate=rate,
    )
)

# Create a new figure:
# - `x_range` is years, `y_range` is months (reversed)
# - tools are resize and hover tools
# - add a nice title, and set the `plot_width` and `plot_height`

# In[32]:

fig = bk.figure(
    plot_width=900,
    plot_height=400,
    x_axis_location="above",
    tools="resize,hover",
    x_range=years,
    y_range=list(reversed(months)),
    title="US Unemployment (1948 - 2013)",
)

# Use the `rect` renderer with the following attributes:
# - fill color for the rectangles is the 'color' field
# - `line_color` for the rectangles is `None`

# In[33]:

fig.rect('year', 'month', 0.95, 0.95, source=source, color='color', line_color=None)

# Style the plot, including:
# - remove the axis and grid lines
# - remove the major ticks
# - make the tick labels smaller
# - set the x-axis orientation to vertical, or angled

# In[35]:

fig.grid.grid_line_color = None
fig.axis.axis_line_color = None
fig.axis.major_tick_line_color = None
fig.axis.major_label_text_font_size = "5pt"
fig.axis.major_label_standoff = 0
fig.xaxis.major_label_orientation = np.pi/3

# Configure the hover tool to display the month, year and rate

# In[40]:

# The figure already owns a hover tool (requested via tools="resize,hover").
# Look that instance up and set its tooltips; constructing a fresh HoverTool
# would leave it detached from the figure and the tooltips would never show.
hover = fig.select(dict(type=HoverTool))
hover.tooltips = OrderedDict([
    ('date', '@month @year'),
    ('rate', '@rate'),
])

# Now we can display the plot. Try moving your pointer over different cells in the plot.

# In[41]:

bk.show(fig)

# Similarly, we can provide a geographic heatmap, here using data just from Texas.
# In[44]:

from bokeh.sampledata import us_counties, unemployment
from collections import OrderedDict

# Color palette from colorbrewer2.org
colors = ['#ffffd4','#fee391','#fec44f','#fe9929','#d95f0e','#993404']
last_color = len(colors) - 1

# Collect, in a single pass over the Texas counties, the longitude and latitude
# values of each county boundary and a color based on its unemployment rate.
county_xs = []
county_ys = []
county_colors = []
for county_id, county in us_counties.data.items():
    if county['state'] != 'tx':
        continue
    county_xs.append(county['lons'])
    county_ys.append(county['lats'])
    try:
        rate = unemployment.data[county_id]
    except KeyError:
        # No unemployment figure for this county
        county_colors.append("black")
    else:
        # One palette step per 2 percentage points, capped at the last color
        county_colors.append(colors[min(int(rate/2), last_color)])

fig = bk.figure(tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave", title="Texas Unemployment 2009")

# Here are the polygons for plotting
fig.patches(county_xs, county_ys, fill_color=county_colors, fill_alpha=0.7,
            line_color="white", line_width=0.5)

# Configure hover tool: fetch the instance the figure created for the "hover"
# entry in `tools` and set its tooltips. A newly constructed HoverTool would not
# be attached to the figure.
hover = fig.select(dict(type=HoverTool))
hover.tooltips = OrderedDict([
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ("fill color", "$color[hex, swatch]:fill_color"),
])

bk.show(fig)

# # High-level Plots
#
# The examples so far have been relatively low-level, in that individual elements of plots need to be specified by hand. The `bokeh.charts` interface makes it easy to get up-and-running with a high-level API that tries to make smart layout and design decisions by default.
#
# To use them, you simply import the chart type you need from `bokeh.charts`:
#
# * `Bar`
# * `BoxPlot`
# * `HeatMap`
# * `Histogram`
# * `Scatter`
# * `Timeseries`

# To illustrate, let's create some random data and display it as histograms.
# In[45]:

normal = np.random.standard_normal(1000)
student_t = np.random.standard_t(6, 1000)
distributions = pd.DataFrame({'Normal': normal, 'Student-T': student_t})

# In[49]:

from bokeh.charts import Histogram
from bokeh.charts import hplot

# Square-root rule for the number of bins
hist = Histogram(distributions, bins=int(np.sqrt(len(normal))), notebook=True)
bk.show(hplot(hist))

# Notice that the chart is configured entirely through keyword arguments to `Histogram`.
#
# Here is a scatter plot example, using the famous iris dataset.

# In[58]:

# `Scatter` has to be imported before its help can be looked up; run top to
# bottom, the import in the next cell would otherwise come too late.
from bokeh.charts import Scatter
get_ipython().run_line_magic('pinfo', 'Scatter')

# In[60]:

from collections import OrderedDict
from bokeh.charts import Scatter
from bokeh.sampledata.iris import flowers

scatter = Scatter(flowers, x="petal_length", y="petal_width", color="species")
#scatter.title("iris dataset, dict_input").xlabel("petal_length").ylabel("petal_width").legend("top_left")
bk.show(scatter)

# # Interactivity with other packages
#
# Bokeh plots are compatible with other Python plotting packages, such as **Matplotlib**, **Seaborn**, and **ggplot**, via an onboard compatibility layer `bokeh.mpl`. This allows existing plots in these packages to be converted seamlessly into Bokeh.
# In[ ]:

titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv')

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')
from ggplot import ggplot, aes, geom_density
from bokeh import mpl
import matplotlib.pyplot as plt

g = ggplot(titanic, aes(x='age', color='pclass')) + geom_density()
g.draw()

plt.title("Plot of titanic passenger age distribution by class")
mpl.to_bokeh(name="density")

# In[ ]:

import seaborn as sns

# Drop the passengers whose age is missing
titanic = titanic.dropna(subset=['age'])

# And then just call the violinplot from Seaborn
sns.violinplot(titanic.age, groupby=titanic.pclass, color="Set3")

plt.title("Distribution of age by passenger class")
mpl.to_bokeh(name="violin")

# # Generating and displaying images
#
# Mandelbrot set images are made by sampling complex numbers and determining for each whether the result tends towards infinity when a particular mathematical operation is iterated on it.
#
# First, functions for generating the Mandelbrot set image. They create a 2D array of numbers, over which a color map can be displayed.

# In[ ]:

# NOTE: this cell used to start with `from __future__ import division`. A future
# statement is only legal at the very top of a module, so in the flattened
# script it is a SyntaxError. The divisions below are made explicitly
# floating-point instead, which gives the same results on Python 2 and 3.

def mandel(x, y, max_iters):
    """
    Given the real and imaginary parts of a complex number,
    determine if it is a candidate for membership in the Mandelbrot
    set given a fixed number of iterations.

    Returns the index of the first iteration at which |z|**2 reaches 4
    (the point has escaped), or `max_iters` if it never does.
    """
    c = complex(x, y)
    z = 0.0j
    for i in range(max_iters):
        z = z*z + c
        # Compare the squared magnitude to avoid taking a square root
        if (z.real*z.real + z.imag*z.imag) >= 4:
            return i

    return max_iters


def create_fractal(min_x, max_x, min_y, max_y, image, iters):
    """
    Fill `image` in place with Mandelbrot escape counts.

    `image` is indexed as image[row, column]; columns are mapped linearly
    onto [min_x, max_x) and rows onto [min_y, max_y). Every pixel receives
    the value of `mandel` for its coordinate, using at most `iters`
    iterations. Nothing is returned.
    """
    height = image.shape[0]
    width = image.shape[1]

    pixel_size_x = (max_x - min_x) / float(width)
    pixel_size_y = (max_y - min_y) / float(height)

    for x in range(width):
        real = min_x + x * pixel_size_x
        for y in range(height):
            imag = min_y + y * pixel_size_y
            color = mandel(real, imag, iters)
            image[y, x] = color

# Define the bounding coordinates to generate the Mandelbrot image, then create a scalar image (2D array of numbers)

# In[ ]:

min_x = -2.0
max_x = 1.0
min_y = -1.0
max_y = 1.0

img = np.zeros((1024, 1536), dtype = np.uint8)
create_fractal(min_x, max_x, min_y, max_y, img, 20)

# Use `image` renderer to display Mandelbrot image, colormapped with "Spectral-11" color palette. The renderer can display many images at once, so it takes lists of images, coordinates, and palettes.

# In[ ]:

bk.image(image=[img],
         x=[min_x],
         y=[min_y],
         dw=[max_x-min_x],
         dh=[max_y-min_y],
         palette=["Spectral-11"],
         x_range = [min_x, max_x],
         y_range = [min_y, max_y],
         title="Mandelbrot",
         tools="pan,wheel_zoom,box_zoom,reset,previewsave",
         plot_width=900,
         plot_height=600
)

# In[ ]:

bk.show()

# # Complex Data Visualization
#
# In 1951, Will Burtin published a visualization to compare the effectiveness of three popular antibiotics on a suite of bacteria, measured in terms of minimum inhibitory concentration.
#
# ![burtin](http://www.americanscientist.org/Libraries/images/2009641416567334-2009-07MacroWainerFA.jpg)
#
# The *longer* the bar, the *smaller* the effective dose. The orange group is Gram-negative bacteria, while the purple is Gram-positive.
#
# We will attempt to re-create this plot in Bokeh.
#
# We start by defining the data and computing some derived quantities using NumPy and Pandas

# In[ ]:

from bokeh.objects import Range1d
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
from math import log, sqrt
from collections import OrderedDict

# One row per bacterium: the concentration for each of the three antibiotics
# and the Gram stain result.
antibiotics = """
bacteria, penicillin, streptomycin, neomycin, gram
Mycobacterium tuberculosis, 800, 5, 2, negative
Salmonella schottmuelleri, 10, 0.8, 0.09, negative
Proteus vulgaris, 3, 0.1, 0.1, negative
Klebsiella pneumoniae, 850, 1.2, 1, negative
Brucella abortus, 1, 2, 0.02, negative
Pseudomonas aeruginosa, 850, 2, 0.4, negative
Escherichia coli, 100, 0.4, 0.1, negative
Salmonella (Eberthella) typhosa, 1, 0.4, 0.008, negative
Aerobacter aerogenes, 870, 1, 1.6, negative
Brucella antracis, 0.001, 0.01, 0.007, positive
Streptococcus fecalis, 1, 1, 0.1, positive
Staphylococcus aureus, 0.03, 0.03, 0.001, positive
Staphylococcus albus, 0.007, 0.1, 0.001, positive
Streptococcus hemolyticus, 0.001, 14, 10, positive
Streptococcus viridans, 0.005, 10, 40, positive
Diplococcus pneumoniae, 0.005, 11, 10, positive
"""

# Insertion order is the order of the drugs in the legend
drug_color = OrderedDict()
drug_color["Penicillin"] = "#0d3362"
drug_color["Streptomycin"] = "#c64737"
drug_color["Neomycin"] = "black"

gram_color = {"positive": "#aeaeb8", "negative": "#e69584"}

# skiprows=1 drops the empty first line of the literal; skipinitialspace strips
# the blank that follows every comma.
csv_buffer = StringIO(antibiotics)
df = pd.read_csv(csv_buffer, skiprows=1, skipinitialspace=True)

width = height = 800
inner_radius = 90
outer_radius = 300 - 10

# Linear map from sqrt(log(1E4 * mic)) to a radius: a concentration of 1000 lands
# on the inner radius and 0.001 on the outer radius.
minr = np.sqrt(log(.001 * 1E4))
maxr = np.sqrt(log(1000 * 1E4))
a = (outer_radius - inner_radius) / (minr - maxr)
b = inner_radius - a * maxr

def rad(mic):
    """Map a concentration (scalar or array-like) to a plot radius."""
    scaled = np.sqrt(np.log(mic * 1E4))
    return a * scaled + b

# One angular slot per bacterium plus one spare slot; each slot is divided
# into seven equal slices.
slot_count = len(df) + 1
big_angle = 2.0 * np.pi / slot_count
small_angle = big_angle / 7

# We are going to be combining several glyph renderers on to one plot. First we need to tell Bokeh to reuse the same plot using hold:

# In[ ]:

bk.figure()
bk.hold(True)

# Next we add the first glyph, the red and blue regions using `annular_wedge`.
# We also take this opportunity to set some of the overall properties of the plot:

# In[ ]:

x = np.zeros(len(df))
y = np.zeros(len(df))
# Start angle of every bacterium's slot, going clockwise from the top
angles = np.pi/2 - big_angle/2 - df.index.values*big_angle
colors = [gram_color[gram] for gram in df.gram]
bk.annular_wedge(
    x, y, inner_radius, outer_radius, -big_angle+angles, angles, color=colors,
    plot_width=width, plot_height=height, title="", tools="",
    x_axis_type=None, y_axis_type=None
)

# Next we grab the current plot using `curplot` and customize the look of the plot further:

# In[ ]:

plot = bk.curplot()
plot.x_range = Range1d(start=-420, end=420)
plot.y_range = Range1d(start=-420, end=420)
plot.min_border = 0
plot.background_fill = "#f0e1d2"
plot.border_fill = "#f0e1d2"
plot.outline_line_color = None
bk.xgrid().grid_line_color = None
bk.ygrid().grid_line_color = None

# Add the small wedges representing the antibiotic effectiveness, also using `annular_wedge`:

# In[ ]:

# (data column, key in drug_color, index of the slice inside the big wedge)
drug_slices = (
    ("penicillin", "Penicillin", 5),
    ("streptomycin", "Streptomycin", 3),
    ("neomycin", "Neomycin", 1),
)
for column, drug, slice_index in drug_slices:
    bk.annular_wedge(
        x, y, inner_radius, rad(df[column]),
        -big_angle + angles + slice_index*small_angle,
        -big_angle + angles + (slice_index + 1)*small_angle,
        color=drug_color[drug],
    )

# Add circular and radial axes lines using `circle`, `text`, and `annular_wedge`:

# In[ ]:

labels = np.power(10.0, np.arange(-3, 4))
radii = rad(labels)
# The axis circles and their labels are all centred on the origin, so pass the
# scalar 0 for the position. `x`/`y` hold one entry per bacterium and do not
# match the number of radii, which would give one glyph columns of different
# lengths.
bk.circle(0, 0, radius=radii, fill_color=None, line_color="white")
bk.text(0, radii[:-1], [str(r) for r in labels[:-1]],
        angle=0, text_font_size="8pt", text_align="center", text_baseline="middle")

# Equal start and end angles make each wedge degenerate into a radial line
bk.annular_wedge(
    x, y, inner_radius-10, outer_radius+10,
    -big_angle+angles, -big_angle+angles, color="black",
)

# Text labels for the bacteria using `text`:

# In[ ]:

# Angle through the middle of every bacterium's slot
mid_angles = -big_angle/2 + angles
xr = radii[0]*np.cos(np.array(mid_angles))
yr = radii[0]*np.sin(np.array(mid_angles))

label_angle = np.array(mid_angles)
label_angle[label_angle < -np.pi/2] += np.pi # easier to read labels on the left side

bk.text(xr, yr, df.bacteria, angle=label_angle,
        text_font_size="9pt", text_align="center", text_baseline="middle")

# Add custom legend

# In[ ]:

bk.circle([-40, -40], [-370, -390], color=list(gram_color.values()), radius=5)
bk.text([-30, -30], [-370, -390], text=["Gram-" + gram for gram in gram_color],
        angle=0, text_font_size="7pt", text_align="left", text_baseline="middle")

bk.rect([-40, -40, -40], [18, 0, -18], width=30, height=13,
        color=list(drug_color.values()))
bk.text([-15, -15, -15], [18, 0, -18], text=list(drug_color.keys()),
        angle=0, text_font_size="9pt", text_align="left", text_baseline="middle")

# In[ ]:

bk.hold(False)
bk.show()

# # Abstract rendering
#
# Bokeh deals with large datasets via **abstract rendering**, which dynamically downsamples areas with high data density so that they can be rendered more efficiently.
#
# Abstract rendering is a bin-based rendering technique that provides greater control over a visual representation and access to larger data sets through server-side processing. Bokeh provides both a high-level and low-level interface with the abstract rendering engine.
#
# At a high level, all abstract rendering applications start with a plot. Abstract rendering takes the plot and renders it to a canvas that uses **data values** instead of colors and **bins** instead of pixels. With the data values collected into bins, the plot can be analyzed and transformed to ensure the true nature of the underlying data source is preserved. Bokeh then **shades** the bins to represent the underlying quantity of data.
#
# Abstract rendering is tied to the bokeh server infrastructure, and can thus only be used with an active bokeh server and with plots employing a ServerDataSource.
#
# The server can be started via:
#
#     bokeh-server -D remotedata
#
# Bokeh's abstract rendering **recipes** provide simplified access to common abstract rendering operations. The recipes interface is declarative, in that a high-level operation is requested and the abstract rendering system constructs the proper low-level function combinations.
#
# ## Example: Heat map and contour plot of census tracts
#
# This example shows how abstract rendering is used to generate a dynamic heatmap. `heatmap` converts a plot of points into a plot of densities. The most common scenario is a recipe with one item type and overplotting issues. Adjacency-matrix based graph visualizations also benefit from a heatmap if there is more than one node per pixel row/col. The basic process is that each bin collects the count of the number of items that fall into the bin. After that, a color scale is constructed that ensures the full range of the data is covered by the color scale.

# ### Note that these plots will not be displayed in the static notebook. They must be executed with a running server.

# In[ ]:

from bokeh.plotting import square, output_server, show
from bokeh.objects import ServerDataSource
import bokeh.transforms.ar_downsample as ar

output_server("Census")
# 2010 US Census tracts
source = ServerDataSource(data_url="/defaultuser/CensusTracts.hdf5", owner_username="defaultuser")
plot = square(
    'LON', 'LAT',
    source=source,
    plot_width=600,
    plot_height=400,
    title="Census Tracts")

ar.heatmap(plot, low=(255, 200, 200), points=True, title="Census Tracts (Server Colors)")

show()

# In[ ]:

ar.contours(plot)

# In[ ]:

show()

# ---

# In[ ]:

import warnings
import IPython.core.formatters
warnings.filterwarnings('ignore', category=IPython.core.formatters.FormatterWarning)

from IPython.core.display import HTML

def css_styling():
    """Return the contents of styles/custom.css wrapped in an IPython HTML object."""
    # The context manager closes the file as soon as it has been read
    with open("styles/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)

css_styling()