First, load up the data.
import sqlite3
import pandas as pd
from plotly import tools
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import Bar, Scatter, Figure, Layout
init_notebook_mode()
# start connection to sqlite data, grab data, then close connection
con = sqlite3.connect('data.db')
tags = pd.read_sql_query('SELECT * from tags;', con)
cities = pd.read_sql_query('SELECT * from cities;', con)
con.close()
# first, get the overall frequency of each tag
tagfrequency = tags.tag.value_counts()
# make tag frequency a dataframe, clean it up
tagfrequency = tagfrequency.to_frame(name = 'frequency')
tagfrequency['city'] = tagfrequency.index
tagfrequency.reset_index(inplace = True, drop = True)
# second, merge the tag frequency with each city in the database
merged = pd.merge(cities, tagfrequency, on = 'city')
merged['ratio'] = merged.frequency / (merged.population / 100000.0)
sh = Scatter(x = merged.population,
y = merged.frequency,
mode = 'markers',
text = merged.city,
hoverinfo = 'text',
marker = dict(
size = 12,
color = 'rgba(0, 0, 152, .5)',
)
)
layout = Layout(
xaxis = dict(
type='log',
title = 'Population (Log Scale)'
),
yaxis = dict(title = 'What.CD Tag Frequency'),
hovermode = 'closest',
showlegend=False
)
fh = Figure(data=[sh], layout = layout)
iplot(fh)
Unsurprisingly, the biggest cities are also the ones that are most frequently tagged (though note Los Angeles has a particularly poor showing: Houston, a city 1/3 its size, is making considerably more hip hop). The real question is about which cities make a surprising amount of hip hop, given their population:
merged = merged.sort_values(by = 'ratio', ascending = False)
# plot scatter of ratio against population
plotdata = merged.iloc[0:50,:]
sh = Scatter(
x = plotdata.population,
y = plotdata.ratio,
mode = 'markers',
text = plotdata.city,
hoverinfo = 'text',
marker = dict(
size = 12, color = 'rgba(0, 0, 152, .5)',
)
)
# plot bars for top 15 cities
plotdata = merged.iloc[0:15,:]
bh = Bar(
x = plotdata.city,
y = plotdata.ratio,
hoverinfo = 'x+y'
)
# layout
fh = tools.make_subplots(rows = 1, cols = 2,
shared_yaxes = True,
horizontal_spacing = 0.02,
subplot_titles = ['Top 50 Cities','Top 10 Cities'])
fh['layout']['showlegend'] = False
fh['layout']['hovermode'] = 'closest'
fh['layout']['xaxis1'].update(title='2010 Population (Log Scale)', type='log')
fh['layout']['yaxis'].update(title='Tags Per 100k People')
fh.append_trace(sh,1,1)
fh.append_trace(bh,1,2)
iplot(fh)
This is the format of your plot grid: [ (1,1) x1,y1 ] [ (1,2) x2,y1 ]
Cash Money Records has been super prolific! Per person, New Orleans is leader in hip hop, with Atlanta not too far behind. Overall, the ranking is dominated by big (but not huge) cities.