from __future__ import division

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.offline
import plotly.figure_factory as ff # generate table
import plotly.plotly as ply
from plotly.graph_objs import * # generate bar charts etc.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# IPython line magic (not plain Python): this file is a notebook export and
# this line only works when executed inside IPython/Jupyter.
%matplotlib inline
# Switch plotly to its offline notebook mode before the iplot() calls below.
init_notebook_mode(connected=True)
Data source: Mother Jones mass shootings dataset (1982-2018), by Mark Follman, Gavin Aronsen and Deanna Pan
# Load the Mother Jones records; Fatalities is converted to int on read so the
# per-venue totals below are integer sums.
fname = 'MotherJones_MassShooting_1982-2018.csv'
df = pd.read_csv(fname, converters={"Fatalities": int})

# Normalise venue labels. The raw file contains variants with stray newlines
# ('\nWorkplace', 'Other\n') that would otherwise form groups of their own;
# stripping whitespace covers those two and any similar variant.
df['Venue'] = df['Venue'].str.strip()

# Total fatalities per venue. Selecting with [['Fatalities']] keeps the
# two-level (Fatalities, sum) column header of the displayed table. The
# previous dict-renaming call, .agg({'Fatalities': ['sum']}) on a single
# column, is deprecated and raises SpecificationError on pandas >= 1.0.
df.groupby('Venue')[['Fatalities']].agg(['sum'])
Fatalities | |
---|---|
sum | |
Venue | |
Airport | 5 |
Military | 38 |
Other | 381 |
Religious | 57 |
School | 162 |
Workplace | 173 |
#df.loc[df['Venue'] == 'Other']['Sources'][2]
# Display the row labels (venues) of the per-venue summary table.
# NOTE(review): sum_per_venue is only assigned further down in this file, so
# this expression relies on notebook execution order -- confirm it is run
# after the cell that builds sum_per_venue.
sum_per_venue.index
Index([u'Airport', u'Military', u'Other', u'Religious', u'School', u'Workplace'], dtype='object', name=u'Venue')
# Per-venue summary: total fatalities ('sum'), number of cases ('count') and
# mean fatalities per case rounded to one decimal ('mean').
venue_groups = df.groupby(['Venue'])
sum_per_venue = venue_groups['Fatalities'].agg(['sum'])
count_per_venue = venue_groups['Case'].count()
sum_per_venue['count'] = count_per_venue
sum_per_venue['mean'] = (sum_per_venue['sum'] / count_per_venue).round(1)
# Put the venue label in front as an ordinary column so it is part of the
# frame handed to the table factory; final column order is
# Venue, sum, count, mean.
sum_per_venue.insert(0, 'Venue', sum_per_venue.index)

# Render the summary as a plotly table, 700 px wide.
table = ff.create_table(sum_per_venue, height_constant=30)
table.layout.width = 700
#ply.iplot(table, filename='table1')
iplot(table, filename='table1')
#sum_per_venue.iloc[:,2].plot(kind='bar')
# Bar chart of mean fatalities per incident by venue, leaving out the
# catch-all 'Other' venue and drawing the School bar in red.
df_mj = sum_per_venue.drop(['Other'])

# Colour and size the bars from the venue labels themselves. The previous
# version hard-coded five bars with the highlight at position 3, which only
# lands on 'School' while the venues are exactly Airport, Military,
# Religious, School, Workplace in that order.
ctable = ['rgba(222,45,38,0.8)' if venue == 'School' else 'rgb(145,191,219)'
          for venue in df_mj['Venue']]
width = [0.5] * len(df_mj)

data = [Bar(x=df_mj['Venue'], y=df_mj['mean'],
            marker=dict(color=ctable), width=width)]
layout = Layout(autosize=False, width=700, height=500,
                title='Mean # of Fatalities per Incident')
fig = Figure(data=data, layout=layout)
#ply.iplot(fig, filename='mean_fatalities_per_incidents')
iplot(fig, filename='mean_fatalities_per_incidents')
#sum_per_venue.plot.barh(y=['sum','count'],stacked='True',colormap='jet')
# Two bar series per venue: number of incidents ('Count') and total
# fatalities ('Sum'), with the School bars drawn in red tones.
venues = df_mj['Venue']

# Derive per-bar colours and widths from the venue labels. The previous
# version assumed exactly five venues with School at position 3, so any
# change in the venue set would mis-colour or mis-size the bars.
is_school = [venue == 'School' for venue in venues]
ctable1 = ['rgba(210,45,38,0.8)' if hit else 'rgb(145,191,219)'
           for hit in is_school]
ctable2 = ['rgba(250,45,38,0.8)' if hit else 'rgb(190,191,219)'
           for hit in is_school]
width = [0.4] * len(df_mj)

data1 = Bar(x=venues, y=df_mj['sum'],
            marker=dict(color=ctable1), width=width, name='Sum')
data2 = Bar(x=venues, y=df_mj['count'],
            marker=dict(color=ctable2), width=width, name='Count')
layout = Layout(autosize=False, width=700, height=500,
                title='# of Mass Shootings and Total # of Fatalities')
# Count series first, then Sum, as in the original trace order.
data = [data2, data1]
fig = Figure(data=data, layout=layout)
#ply.iplot(fig, filename='count_sum_fatalities_by_venue')
iplot(fig, filename='count_sum_fatalities_by_venue')
# Load the second dataset and, for each value of its 'School' column, count
# the rows and total the 'Number Killed [CALCULATED]' column.
fname = 'CDC_MassShooting_2009-2016.csv'
df_cdc = pd.read_csv(fname)
killed_by_school = df_cdc.groupby('School')['Number Killed [CALCULATED]']
df_cdc_school = killed_by_school.agg(['count', 'sum'])
# Bare expression: displays the result when run as a notebook cell.
df_cdc_school
count | sum | |
---|---|---|
School | ||
No | 150 | 783 |
Yes | 6 | 65 |
# Print how many rows each dataset marks as a school incident:
# first the Venue == 'School' rows of df, then the School == 'Yes' rows of df_cdc.
mj_school_rows = df.loc[df['Venue'] == 'School']
cdc_school_rows = df_cdc.loc[df_cdc['School'] == 'Yes']
print(len(mj_school_rows))
print(len(cdc_school_rows))
16 6
# Display the df_cdc rows whose 'School' column equals 'Yes'.
df_cdc.loc[df_cdc['School'] == 'Yes']
Date | City | State | Listed in FBI SHR (2009-2012) | TOTAL SHOT (Not Including Shooter) | Number Killed [CALCULATED] | Number Injured | Females Killed | Males Killed | Children Killed (17 and Under) | ... | School | Workplace | Multiple | Other (Note) | Solely Home But Not DV [CALCULATED] | Red Flag | Subsequent FBI Terrorism Investigation | Public Space | Gun-Free Zone | Took place exclusively in private residence(s) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | 4/3/09 | Binghamton | NY | Yes | 17 | 13 | 4 | 11 | 2 | 0 | ... | Yes | No | No | NaN | No | No | No | Yes | Yes | no |
65 | 4/2/12 | Oakland | CA | Yes | 7 | 7 | 0 | 6 | 1 | 0 | ... | Yes | No | No | NaN | No | No | No | Yes | Yes | no |
79 | 12/14/12 | Newtown | CT | No | 29 | 27 | 2 | 19 | 8 | 20 | ... | Yes | No | No | NaN | No | No | No | Yes | Yes | no |
89 | 6/7/13 | Santa Monica | CA | NaN | 9 | 5 | 4 | 2 | 3 | 0 | ... | Yes | No | Yes | home, road, school | No | No | No | Yes | Yes | no |
113 | 10/24/14 | Marysville | WA | NaN | 5 | 4 | 1 | 3 | 1 | 4 | ... | Yes | No | No | NaN | No | No | No | Yes | Yes | no |
136 | 10/1/15 | Roseberg | OR | NaN | 18 | 9 | 9 | 3 | 6 | 0 | ... | Yes | No | No | NaN | No | No | No | Yes | No | no |
6 rows × 49 columns
# School vs. not-school comparison table. Start from the per-'School'
# count/sum computed from df_cdc, then replace the 'Yes' row with the School
# figures from the per-venue summary.
# Work on an explicit copy: pd.DataFrame(df_cdc_school) can share data with
# its source, so overwriting the 'Yes' row could also change df_cdc_school.
binary_df = df_cdc_school.copy()

# Look the School row up by label. The previous iloc[4] only pointed at
# 'School' while the venue index happened to sort it into fifth position.
school_totals = sum_per_venue.loc['School']
for column in ('count', 'sum'):
    binary_df.loc['Yes', column] = school_totals[column]

binary_df['mean'] = binary_df['sum'] / binary_df['count']
# Bare expression: displays the table rounded to one decimal in a notebook.
binary_df.round(1)
count | sum | mean | |
---|---|---|---|
School | |||
No | 150 | 783 | 5.2 |
Yes | 16 | 162 | 10.1 |
# Horizontal bar chart of binary_df['mean'], labelled 'Not School' and
# 'School' in that order, the second bar in red.
ctable = ['rgb(145,191,219)', 'rgba(222,45,38,0.8)']
width = [0.5, 0.5]
mean_bars = Bar(
    y=['Not School', 'School'],
    x=binary_df['mean'],
    orientation='h',
    marker=dict(color=ctable),
    width=width,
)
data = [mean_bars]
layout_options = dict(
    autosize=False,
    width=700,
    height=500,
    title='Mean # of Fatalities per Incident: School vs. Not School',
)
layout = Layout(**layout_options)
fig = Figure(data=data, layout=layout)
#ply.iplot(fig, filename='mean_fatalities_school_no_school')
iplot(fig, filename='mean_fatalities_school_no_school')