#!/usr/bin/env python # coding: utf-8 # **Table of Contents** # #
# # # Load modules and collect data # - use `sqlite3` (database small enough to be held in memory) # In[1]: import os from os.path import dirname, join import numpy as np import pandas.io.sql as psql import sqlite3 as sql from bokeh.sampledata.movies_data import movie_path __file__ = os.path.expanduser('~/git_local/bokeh_original/examples/app/movies/main.py') conn = sql.connect(movie_path) query = open(join(dirname(__file__), 'query.sql')).read() movies = psql.read_sql(query, conn) # In[2]: print query # In[3]: movies.head(n=3) # In[4]: movies.describe() # In[5]: movies.shape # In[6]: conn # In[7]: movies["color"] = np.where(movies["Oscars"] > 0, "orange", "grey") movies["alpha"] = np.where(movies["Oscars"] > 0, 0.9, 0.25) # In[8]: movies.columns # In[9]: movies[['color','alpha']].T # # handle null values # In[10]: movies.isnull().sum() # In[11]: movies.fillna(0, inplace=True) # just replace missing values with zero movies.isnull().sum() # In[12]: movies["revenue"] = movies.BoxOffice.apply(lambda x: '{:,d}'.format(int(x))) # In[13]: movies['revenue'] # In[14]: with open(join(dirname(__file__), "razzies-clean.csv")) as f: razzies = f.read().splitlines() # In[15]: movies.loc[movies.imdbID.isin(razzies), "color"] = "purple" movies.loc[movies.imdbID.isin(razzies), "alpha"] = 0.9 axis_map = { "Tomato Meter": "Meter", "Numeric Rating": "numericRating", "Number of Reviews": "Reviews", "Box Office (dollars)": "BoxOffice", "Length (minutes)": "Runtime", "Year": "Year", } # # now let's work with Bokeh # In[16]: from bokeh.plotting import figure from bokeh.layouts import layout, widgetbox from bokeh.models import ColumnDataSource, HoverTool, Div from bokeh.models.widgets import Slider, Select, TextInput from bokeh.io import curdoc # In[17]: desc = Div(text=open(join(dirname(__file__), "description.html")).read(), width=800) # ## Create Input Controls # In[18]: reviews = Slider(title="Minimum number of reviews", value=80, start=10, end=300, step=10) min_year = Slider(title="Year released", start=1940, end=2014, value=1970, step=1) max_year = Slider(title="End Year released", start=1940, end=2014, value=2014, step=1) oscars = Slider(title="Minimum number of Oscar wins", start=0, end=4, value=0, step=1) boxoffice = Slider(title="Dollars at Box Office (millions)", start=0, end=800, value=0, step=1) genre = Select(title="Genre", value="All", options=open(join(dirname(__file__), 'genres.txt')).read().split()) director = TextInput(title="Director name contains") cast = TextInput(title="Cast names contains") x_axis = Select(title="X Axis", options=sorted(axis_map.keys()), value="Tomato Meter") y_axis = Select(title="Y Axis", options=sorted(axis_map.keys()), value="Number of Reviews") # ## Create Column Data Source that will be used by the plot # In[19]: source = ColumnDataSource(data=dict(x=[], y=[], color=[], title=[], year=[], revenue=[], alpha=[])) # In[20]: hover = HoverTool(tooltips=[ ("Title", "@title"), ("Year", "@year"), ("$", "@revenue") ]) # In[21]: p = figure(plot_height=600, plot_width=700, title="", toolbar_location=None, tools=[hover]) p.circle(x="x", y="y", source=source, size=7, color="color", line_color=None, fill_alpha="alpha") # In[22]: def select_movies(): genre_val = genre.value director_val = director.value.strip() cast_val = cast.value.strip() selected = movies[ (movies.Reviews >= reviews.value) & (movies.BoxOffice >= (boxoffice.value * 1e6)) & (movies.Year >= min_year.value) & (movies.Year <= max_year.value) & (movies.Oscars >= oscars.value) ] if (genre_val != "All"): selected = selected[selected.Genre.str.contains(genre_val)==True] if (director_val != ""): selected = selected[selected.Director.str.contains(director_val)==True] if (cast_val != ""): selected = selected[selected.Cast.str.contains(cast_val)==True] return selected # In[23]: def update(): df = select_movies() x_name = axis_map[x_axis.value] y_name = axis_map[y_axis.value] p.xaxis.axis_label = x_axis.value p.yaxis.axis_label = y_axis.value p.title.text = "%d movies selected" % len(df) source.data = dict( x=df[x_name], y=df[y_name], color=df["color"], title=df["Title"], year=df["Year"], revenue=df["revenue"], alpha=df["alpha"], ) # In[24]: controls = [reviews, boxoffice, genre, min_year, max_year, oscars, director, cast, x_axis, y_axis] for control in controls: control.on_change('value', lambda attr, old, new: update()) sizing_mode = 'fixed' # 'scale_width' also looks nice with this example inputs = widgetbox(*controls, sizing_mode=sizing_mode) l = layout([ [desc], [inputs, p], ], sizing_mode=sizing_mode) # In[25]: update() # initial load of the data # In[26]: curdoc().add_root(l) curdoc().title = "Movies" # In[ ]: