import pandas as pd
import glob
import plotly.plotly as py
from plotly.graph_objs import *
gray_light = '#bbbbbb'
gray_med = '#737373'
red_orange = '#ff3700'
# Import datasets for each year
sal11 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2011.csv',index_col=0)
sal12 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2012.csv',index_col=0)
sal13 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2013.csv',index_col=0,names=sal12.columns,skiprows=0)
sal14 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2014.csv',index_col=0)
# For some awful reason the 2013 is labeled differently than 2012 and 2014
sal13['Agency'] = sal13['Agency'].str.replace(r'Police Department.+', 'Police Department ')
# Select only police department employees
all_police11 = sal11[sal11['Agency']=='Police Department']
all_police12 = sal12[sal12['Agency']=='Police Department ']
all_police13 = sal13[sal13['Agency']=='Police Department ']
all_police14 = sal14[sal14['Agency']=='Police Department ']
all_police11.head()
# List all job titles (2014)
BPD_job_titles = pd.DataFrame(zeros(len(all_police14.groupby('JobTitle').groups)),columns=['job_titles'])
BPD_gb_job_titles = all_police14.groupby('JobTitle')
for i, j in enumerate(BPD_gb_job_titles.groups):
BPD_job_titles['job_titles'].ix[i] = j
BPD_job_titles.sort('job_titles',ascending=True,inplace=True)
list(BPD_job_titles['job_titles'])
def remove_dollar_sign(df_col):
parsed_df_col = df_col.str.replace("$","").apply(lambda x: float(x))
return parsed_df_col
def parse(df):
df['PAnnualSalary'] = remove_dollar_sign(df['AnnualSalary'])
df['PGrossPay'] = remove_dollar_sign(df['GrossPay'])
df['PayRatio'] = df['PGrossPay'] / df['PAnnualSalary']
df['PayDiff'] = df['PGrossPay'] - df['PAnnualSalary']
#df = df[df['PayRatio']>=1]
return df
def police_groups(all_police):
officers = all_police[(all_police['JobTitle'] == 'POLICE OFFICER') | (all_police['JobTitle'] == 'POLICE OFFICER (EID)')]
other_police= all_police[(all_police['JobTitle'] != 'POLICE OFFICER') & (all_police['JobTitle'] != 'POLICE OFFICER (EID)')]
return officers, other_police
all_police11_full_salary_paid = parse(all_police11)
all_police12_full_salary_paid = parse(all_police12)
all_police13_full_salary_paid = parse(all_police13)
all_police14_full_salary_paid = parse(all_police14)
all_police_dfs = {'2011':all_police11_full_salary_paid,
'2012':all_police12_full_salary_paid,
'2013':all_police13_full_salary_paid,
'2014':all_police14_full_salary_paid}
field_names=['AvgSalary','AvgPayRatio','AvgPayDiff','TotalPayDiff','MaxPayDiff','MaxPayRatio','EmployeeCount']
'''this = all_police14
print len(this)
print len(this[this['PGrossPay']>=this['PAnnualSalary']])
print float(len(this[this['PGrossPay']>=this['PAnnualSalary']]))/float(len(this))
'''
'''
print all_police11['PayRatio'].median()
print all_police12['PayRatio'].median()
print all_police13['PayRatio'].median()
print all_police14['PayRatio'].median()
'''
def calc_row_metrics(workforce, year):
metrics_row = pd.DataFrame(zeros([1,len(field_names)]),columns=field_names)
# average salary
metrics_row['AvgSalary'] = workforce['PAnnualSalary'].mean()
# average pay ratio
metrics_row['AvgPayRatio'] = workforce['PayRatio'].mean()
# average pay diff
metrics_row['AvgPayDiff'] = workforce['PayDiff'].mean()
# max pay ratio
metrics_row['MaxPayRatio'] = workforce['PayRatio'].max()
# total pay diff
metrics_row['TotalPayDiff'] = workforce['PayDiff'].sum()
# max pay diff
metrics_row['MaxPayDiff'] = workforce['PayDiff'].max()
# BPD employee count
metrics_row['EmployeeCount'] = len(workforce)
return metrics_row
def calc_some_metrics():
index = [2011,2012,2013,2014]
all_police_metrics = pd.DataFrame(zeros([len(index),len(field_names)]), index=index, columns=field_names)
officers_metrics = pd.DataFrame(zeros([len(index),len(field_names)]), index=index, columns=field_names)
other_police_metrics = pd.DataFrame(zeros([len(index),len(field_names)]), index=index, columns=field_names)
for i, j in enumerate(all_police_dfs):
officers, other_police = police_groups(all_police_dfs[str(j)])
all_police_metrics.ix[float(j)] = calc_row_metrics(all_police_dfs[str(j)],j).ix[0]
officers_metrics.ix[float(j)] = calc_row_metrics(officers,j).ix[0]
other_police_metrics.ix[float(j)] = calc_row_metrics(other_police,j).ix[0]
return all_police_metrics, officers_metrics, other_police_metrics
officers, other_police = police_groups(all_police_dfs[str(2012)])
all_police_metrics, officers_metrics, other_police_metrics = calc_some_metrics()
all_police_metrics
officers_metrics
other_police_metrics
# Looking at the police with the largest bonuses
Top20_11 = all_police11.sort('PayDiff', ascending=False)[:20]
Top20_12 = all_police12.sort('PayDiff', ascending=False)[:20]
Top20_13 = all_police13.sort('PayDiff', ascending=False)[:20]
Top20_14 = all_police14.sort('PayDiff', ascending=False)[:20]
# Metrics for the top 20
def calc_some_metrics_top20():
Top20_dfs = {2011: Top20_11, 2012: Top20_12, 2013: Top20_13, 2014: Top20_14}
Top20_metrics = pd.DataFrame(zeros([4,len(field_names)]), index=[2011,2012,2013,2014], columns=field_names)
for i, j in enumerate(Top20_dfs):
Top20_metrics.ix[j] = calc_row_metrics(Top20_dfs[j],j).ix[0]
return Top20_metrics
Top20_metrics = calc_some_metrics_top20()
Top20_metrics
Top20_11
other_police.sort('PayDiff',ascending=False)[:20]
a=sal14['PayDiff'].groupby(sal14['Agency']).get_group('Police Department ')
a[a>0].hist()
import plotly.plotly as py
#all_police14_full_salary_paid = all_police14[all_police14['PayRatio'] >= 1.0]
bonus14_percent = (all_police14['PayDiff']/all_police14['PAnnualSalary'])
title = 'Baltimore Police
Salary and Bonus/Overtime in FY2014'
# slope equal to the median bonus % of salary - half of points are above line, half below
slope = bonus14_percent.median()
median_bonus = [[0,300000],[0,slope*300000]]
trace1 = Scatter(x=all_police14['PAnnualSalary'],
y=all_police14['PayDiff'],
text = all_police14['PayDiff'].index,
mode = 'markers',
marker = Marker(size = 5,
sizemode = 'diameter',
#sizeref = schoolage_pop.max()/1000,
opacity = 0.4,
color = 'gray')
)
trace2 = Scatter(x=median_bonus[0],
y=median_bonus[1],
text='',
mode = 'lines',
line=Line(color='black',width=1.0)
)
trace3 = Scatter(x=[0,300000],
y=[0,0],
text='',
mode = 'lines',
line=Line(color=red_orange,width=2.0)
)
trace4 = Scatter(x=[0,300000],
y=[0,300000],
text='',
mode = 'lines',
line=Line(color=red_orange,width=1.0)
)
layout = Layout(
title = title,
xaxis = XAxis(range = [0,120000],
title = 'Salary',
showgrid = False,
showline = False,
zeroline = False,
autotick = True,
showticklabels = True),
yaxis = YAxis(range = [-100000,120000],
title = 'Bonus/Overtime Pay
',
showgrid = False,
showline = False,
zeroline = False,
autotick = True,
showticklabels = True),
autosize = False,
width=500,
height=500,
font = Font(size=14),
hovermode = 'closest',
showlegend=False,
margin=Margin(
l=75,
r=75,
b=50,
t=50,
pad=0)
)
data = Data([trace1,trace2,trace4])
fig = Figure(data=data, layout=layout)
py.iplot(fig, filename=title)
bonus14_percent = (all_police14['PayDiff']/all_police14['PAnnualSalary'])
fig = plt.figure(figsize=[16,3])
#print bonus14_percent.median()
#print bonus14_percent.mean()
bonus14_percent.hist(bins=50)
import numpy as np
from scipy.interpolate import UnivariateSpline
from matplotlib import pyplot as plt
fig = plt.figure(figsize=[16,3])
ax1 = plt.subplot(1,1,1)
N = 1000
n = N/10
for i, j in enumerate(all_police_dfs):
s = np.array(all_police_dfs[str(j)]['PayDiff'].fillna(0))
p, x = np.histogram(s, bins=n) # bin it into n = N/10 bins
x = x[:-1] + (x[1] - x[0])/2 # convert bin edges to centers
f = UnivariateSpline(x, p, s=n)
plt.plot(x, f(x))
xlim([-20000,50000])
legend(['2011','2012','2013','2014'])
# turn off square border around plot
ax1.spines["top"].set_visible(False)
ax1.spines["bottom"].set_visible(False)
ax1.spines["right"].set_visible(False)
ax1.spines["left"].set_visible(False)
# turn off ticks
ax1.tick_params(axis="both", which="both", bottom="off", top="off",
labelbottom="on", left="off", right="off", labelleft="on",labelsize=10)
ylim([0,600])
plt.show()
all_police14[all_police14['PAnnualSalary'] == 43999]
all_police14['Name'] = all_police14.index
all_police14['Last_Name'] = all_police14['Name'].apply(lambda x: x.split(',')[0])
all_police14['First_Name'] = all_police14['Name'].apply(lambda x: x.split(',')[1] if len(x.split(','))>1 else '')
all_police13['Name'] = all_police13.index
all_police13['Last_Name'] = all_police13['Name'].apply(lambda x: x.split(',')[0])
all_police13['First_Name'] = all_police13['Name'].apply(lambda x: x.split(',')[1] if len(x.split(','))>1 else '')
all_police12['Name'] = all_police12.index
all_police12['Last_Name'] = all_police12['Name'].apply(lambda x: x.split(',')[0])
all_police12['First_Name'] = all_police12['Name'].apply(lambda x: x.split(',')[1] if len(x.split(','))>1 else '')
'''
police_roll_14 = pd.DataFrame(index=all_police14.index)
police_roll_14['First_Name'] = all_police14['First_Name']
police_roll_14['Last_Name'] = all_police14['Last_Name']
police_roll_14.to_csv('Police_Roll_2014.csv')
police_roll_12 = pd.DataFrame(index=all_police12.index)
police_roll_12['First_Name'] = all_police12['First_Name']
police_roll_12['Last_Name'] = all_police12['Last_Name']
police_roll_13 = pd.DataFrame(index=all_police13.index)
police_roll_13['First_Name'] = all_police13['First_Name']
police_roll_13['Last_Name'] = all_police13['Last_Name']
'''
all_police14[all_police14['Last_Name']=='Rice']
all_police14[all_police14['Last_Name']=='White']
all_police14[all_police14['Last_Name']=='Porter']
all_police14[all_police14['Last_Name']=='Miller']
all_police14[all_police14['Last_Name']=='Nero']
all_police14[all_police14['Last_Name']=='Goodson Jr']