#!/usr/bin/env python
# coding: utf-8

# Bokeh Tutorial

# #

# 07. Bar and Categorical Data Plots

# In[ ]:


from bokeh.io import show, output_notebook
from bokeh.plotting import figure

output_notebook()


# ## Basic Bar Charts
#
# Bar charts are a common and important type of plot. Bokeh makes it simple to
# create all sorts of stacked or nested bar charts, and to deal with
# categorical data in general.

# The example below shows a simple bar chart created using the `vbar` method
# for drawing vertical bars. (There is a corresponding `hbar` for horizontal
# bars.) We also set a few plot properties to make the chart look nicer, see
# chapter [Styling and Theming](02 - Styling and Theming.ipynb) for
# information about visual properties.

# In[ ]:


# Here is a list of categorical values (or factors)
fruits = ['Apples', 'Pears', 'Nectarines', 'Plums', 'Grapes', 'Strawberries']

# Set the x_range to the list of categories above
p = figure(x_range=fruits, height=250, title="Fruit Counts")

# Categorical values can also be used as coordinates
p.vbar(x=fruits, top=[5, 3, 4, 2, 4, 6], width=0.9)

# Set some properties to make the plot look better
p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)


# When we want to create a plot with a categorical range, we pass the ordered
# list of categorical values to `figure`, e.g. `x_range=['a', 'b', 'c']`. In
# the plot above, we passed the list of fruits as `x_range`, and we can see
# those reflected as the x-axis.
#
# The `vbar` glyph method takes an `x` location for the center of the bar, a
# `top` and `bottom` (which defaults to 0), and a `width`. When we are using a
# categorical range as we are here, each category implicitly has width of 1,
# so setting `width=0.9` as we have done here makes the bars shrink away from
# each other. (Another option would be to add some padding to the range.)

# In[ ]:


# Exercise: Create your own simple bar chart


# Since `vbar` is a glyph method, we can use it with a `ColumnDataSource` just
# as we would with any other glyph.
# In the example below, we put the data (including color data) in a
# `ColumnDataSource` and use that to drive our plot. We also add a legend, see
# chapter [Adding Annotations.ipynb](03 - Adding Annotations.ipynb) for more
# information about legends and other annotations.

# In[ ]:


from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6

fruits = ['Apples', 'Pears', 'Nectarines', 'Plums', 'Grapes', 'Strawberries']
counts = [5, 3, 4, 2, 4, 6]

# The three columns are row-aligned: one count and one color per fruit.
source = ColumnDataSource(data=dict(fruits=fruits, counts=counts, color=Spectral6))

# The upper bound of 9 leaves room above the tallest bar (6); the legend is
# positioned at "top_center" below.
p = figure(x_range=fruits, height=250, y_range=(0, 9), title="Fruit Counts")

# String arguments name columns of `source` rather than literal values.
p.vbar(x='fruits', top='counts', width=0.9, color='color',
       legend_field="fruits", source=source)

p.xgrid.grid_line_color = None
p.legend.orientation = "horizontal"
p.legend.location = "top_center"

show(p)


# In[ ]:


# Exercise: Create your own simple bar chart driven by a ColumnDataSource


# ## Stacked Bars
#
# It's often desirable to stack bars together. Bokeh makes this
# straightforward using the `vbar_stack` and `hbar_stack` methods. When
# passing data to one of these methods, the data source should have a series
# for each "row" in the stack. You will provide an ordered list of column
# names to stack together from the data source.
#
# In the example below, we see simulated data for fruit exports (positive
# values) and imports (negative values) stacked using two calls to
# `hbar_stack`. The values in the columns for each year are ordered according
# to the `fruits`, i.e. this is not a "tidy" data format.
# In[ ]:


from bokeh.palettes import GnBu3, OrRd3

years = ['2015', '2016', '2017']

# Exports are positive and imports are negative, so the two stacks extend in
# opposite directions from zero. Each year column is ordered like `fruits`.
exports = {'fruits' : fruits,
           '2015'   : [2, 1, 4, 3, 2, 4],
           '2016'   : [5, 3, 4, 2, 4, 6],
           '2017'   : [3, 2, 4, 4, 5, 3]}
imports = {'fruits' : fruits,
           '2015'   : [-1, 0, -1, -3, -2, -1],
           '2016'   : [-2, -1, -3, -1, -2, -2],
           '2017'   : [-1, -2, -1, 0, -2, -2]}

p = figure(y_range=fruits, height=250, x_range=(-16, 16),
           title="Fruit import/export, by year")

# `years` is the ordered list of columns to stack; one legend label per year.
p.hbar_stack(years, y='fruits', height=0.9, color=GnBu3,
             source=ColumnDataSource(exports),
             legend_label=["%s exports" % year for year in years])

p.hbar_stack(years, y='fruits', height=0.9, color=OrRd3,
             source=ColumnDataSource(imports),
             legend_label=["%s imports" % year for year in years])

p.y_range.range_padding = 0.1
p.ygrid.grid_line_color = None
p.legend.location = "center_left"

show(p)


# Notice we also added some padding *around* the categorical range (e.g. at
# both ends of the axis) by specifying
#
# ```
# p.y_range.range_padding = 0.1
# ```

# In[ ]:


# Exercise: Create a stacked bar chart with a single call to vbar_stack


# ## Grouped Bar Charts
#
# Sometimes we want to group bars together, instead of stacking them. Bokeh
# can handle up to three levels of nested (hierarchical) categories, and will
# automatically group output according to the outermost level. To specify
# nested categorical coordinates, the columns of the data source should
# contain tuples, for example:
#
#     x = [ ("Apples", "2015"), ("Apples", "2016"), ("Apples", "2017"), ("Pears", "2015"), ... ]
#
# Values in other columns correspond to each item in `x`, exactly as in other
# cases. When plotting with these kinds of nested coordinates, we must tell
# Bokeh the contents and order of the axis range, by explicitly passing a
# `FactorRange` to `figure`. In the example below, this is seen as
#
#     p = figure(x_range=FactorRange(*x), ....)
#

# In[ ]:


from bokeh.models import FactorRange

fruits = ['Apples', 'Pears', 'Nectarines', 'Plums', 'Grapes', 'Strawberries']
years = ['2015', '2016', '2017']

data = {'fruits' : fruits,
        '2015'   : [2, 1, 4, 3, 2, 4],
        '2016'   : [5, 3, 3, 2, 4, 6],
        '2017'   : [3, 2, 4, 4, 5, 3]}

# this creates [ ("Apples", "2015"), ("Apples", "2016"), ("Apples", "2017"), ("Pears", "2015"), ... ]
x = [ (fruit, year) for fruit in fruits for year in years ]

# zip() yields one (2015, 2016, 2017) tuple per fruit; concatenating them
# gives counts in the same fruit-major, year-minor order as `x`.
counts = sum(zip(data['2015'], data['2016'], data['2017']), ()) # like an hstack

source = ColumnDataSource(data=dict(x=x, counts=counts))

p = figure(x_range=FactorRange(*x), height=250, title="Fruit Counts by Year")

p.vbar(x='x', top='counts', width=0.9, source=source)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None

show(p)


# In[ ]:


# Exercise: Make the chart above have a different color for each year by adding colors to the ColumnDataSource


# Another way we can set the color of the bars is to use a transform. We first
# saw some transforms in the previous chapter
# [Data Sources and Transformations](04 - Data Sources and Transformations.ipynb).
# Here we use a new one, `factor_cmap`, that accepts the name of a column to
# use for colormapping, as well as the palette and factors that define the
# color mapping.
#
# Additionally we can configure it to map just the sub-factors if desired. For
# instance in this case we don't want to shade each `(fruit, year)` pair
# differently. Instead, we want to only shade based on the `year`. So we pass
# `start=1` and `end=2` to specify the slice range of each factor to use when
# colormapping. Then we pass the result as the `fill_color` value:
#
# ```
# fill_color=factor_cmap('x', palette="Bright3", factors=years, start=1, end=2)
# ```
# to have the colors be applied automatically based on the underlying data.
# In[ ]:


from bokeh.transform import factor_cmap

# Reuses `x`, `years` and `source` from the grouped bar chart cell above.
p = figure(x_range=FactorRange(*x), height=250, title="Fruit Counts by Year")

p.vbar(x='x', top='counts', width=0.9, source=source, line_color="white",

       # use the palette to colormap based on the x[1:2] values
       fill_color=factor_cmap('x', palette="Bright3", factors=years, start=1, end=2))

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None

show(p)


# It is also possible to achieve grouped bar plots using another technique
# called "visual dodge". That would be useful e.g. if you only wanted to have
# the axis labeled by fruit type, and not include the years on the axis. This
# tutorial does not cover that technique but you can find information in the
# [User's Guide](https://bokeh.pydata.org/en/dev/docs/user_guide/categorical.html#visual-dodge).

# ## Mixing Categorical Levels
#
# If you have created a range with nested categories as above, it is possible
# to plot glyphs using only the "outer" categories, if desired. The plot below
# shows monthly values grouped by quarter as bars. The data for these are in
# the familiar format:
#
#     factors = [("Q1", "jan"), ("Q1", "feb"), ("Q1", "mar"), ....]
#
# The plot also overlays a line representing average quarterly values, and
# this is accomplished by using only the "quarter" part of each nested
# category:
#
#     p.line(x=["Q1", "Q2", "Q3", "Q4"], y=....)
# In[ ]:


factors = [("Q1", "jan"), ("Q1", "feb"), ("Q1", "mar"),
           ("Q2", "apr"), ("Q2", "may"), ("Q2", "jun"),
           ("Q3", "jul"), ("Q3", "aug"), ("Q3", "sep"),
           ("Q4", "oct"), ("Q4", "nov"), ("Q4", "dec")]

p = figure(x_range=FactorRange(*factors), height=250)

# One value per (quarter, month) factor, in the same order as `factors`.
x = [ 10, 12, 16, 9, 10, 8, 12, 13, 14, 14, 12, 16 ]
p.vbar(x=factors, top=x, width=0.9, alpha=0.5)

# The overlay uses only the outer ("quarter") level of the nested factors.
qs, aves = ["Q1", "Q2", "Q3", "Q4"], [12, 9, 13, 14]
p.line(x=qs, y=aves, color="red", line_width=3)
# `scatter` draws circle markers by default; `circle(size=...)` is deprecated
# as of Bokeh 3.4.
p.scatter(x=qs, y=aves, line_color="red", fill_color="white", size=10)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None

show(p)


# ## Using Pandas `GroupBy`
#
# We may want to make charts based on the results of "group by" operations.
# Bokeh can utilize Pandas `GroupBy` objects directly to make this simpler.
# Let's take a look at how Bokeh deals with `GroupBy` objects by examining the
# "cars" data set.

# In[ ]:


from bokeh.sampledata.autompg import autompg_clean as df

# Convert the cylinder counts to strings so they can be used as categorical
# factors.
df.cyl = df.cyl.astype(str)
df.head()


# Suppose we would like to display some values grouped according to `"cyl"`.
# If we create `df.groupby('cyl')` then call `group.describe()` we can see
# that Pandas automatically computes various statistics for each group.

# In[ ]:


group = df.groupby('cyl')

group.describe()


# Bokeh allows us to create a `ColumnDataSource` directly from Pandas
# `GroupBy` objects, and when this happens, the data source is automatically
# filled with the summary values from `group.describe()`. Observe the column
# names below, which correspond to the output above.

# In[ ]:


source = ColumnDataSource(group)

",".join(source.column_names)


# Knowing these column names, we can immediately create bar charts based on
# Pandas `GroupBy` objects. The example below plots the average MPG per
# cylinder, i.e.
# columns `"mpg_mean"` vs `"cyl"`

# In[ ]:


# Color each bar by its `cyl` value; the factors are the sorted distinct
# cylinder strings found in `df`.
cyl_cmap = factor_cmap('cyl', palette="Spectral5", factors=sorted(df.cyl.unique()))

# The GroupBy object itself is passed as the categorical x-range.
p = figure(height=350, x_range=group)
p.vbar(x='cyl', top='mpg_mean', width=0.9, line_color="white",
       fill_color=cyl_cmap, source=source)

p.xgrid.grid_line_color = None
p.xaxis.axis_label = "number of cylinders"
p.yaxis.axis_label = "Mean MPG"
p.y_range.start = 0

show(p)


# In[ ]:


# Exercise: Use the same dataset to make a similar plot of mean horsepower (hp) by origin


# ## Categorical Scatterplots
#
# So far we have seen Categorical data used together with various bar glyphs.
# But Bokeh can use categorical coordinates for almost any glyph. Let's create
# a scatter plot with categorical coordinates on one axis. The `commits` data
# set simply has a series of datetimes of GitHub commits. Additional columns
# to express the day and hour of day for each commit have already been added.

# In[ ]:


from bokeh.sampledata.commits import data

data.head()


# To create our scatter plot, we pass the list of categories as the range just
# as before
#
#     p = figure(y_range=DAYS, ...)
#
# Then we can plot circle markers for each commit, with `"time"` driving the
# x-coordinate, and `"day"` driving the y-coordinate.
#
#     p.scatter(x='time', y='day', ...)
#
# To make the values more distinguishable, we can also add a `jitter`
# transform to the y-coordinate, which is shown in the complete example below.

# In[ ]:


from bokeh.transform import jitter

DAYS = ['Sun', 'Sat', 'Fri', 'Thu', 'Wed', 'Tue', 'Mon']

source = ColumnDataSource(data)

p = figure(width=800, height=300, y_range=DAYS, x_axis_type='datetime',
           title="Commits by Time of Day (US/Central) 2012—2016")

# `scatter` draws circle markers by default; calling `circle` without a
# radius is deprecated as of Bokeh 3.4. The jitter transform is applied to
# the categorical `day` coordinate, using the plot's y-range.
p.scatter(x='time', y=jitter('day', width=0.6, range=p.y_range), source=source, alpha=0.3)

p.xaxis[0].formatter.days = '%Hh'
p.x_range.range_padding = 0
p.ygrid.grid_line_color = None

show(p)


# In[ ]:


# Exercise: Create a plot using categorical coordinates and any non-"bar" glyphs


# # Next Section

# Click on this link to go to the next notebook: [08 - Graph and Network Plots](08%20-%20Graph%20and%20Network%20Plots.ipynb).
#
# To go back to the overview, click [here](00%20-%20Introduction%20and%20Setup.ipynb).

# In[ ]:


# In[ ]: