import pandas as pd import glob import plotly.plotly as py from plotly.graph_objs import * gray_light = '#bbbbbb' gray_med = '#737373' red_orange = '#ff3700' # Import datasets for each year sal11 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2011.csv',index_col=0) sal12 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2012.csv',index_col=0) sal13 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2013.csv',index_col=0,names=sal12.columns,skiprows=0) sal14 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2014.csv',index_col=0) # For some awful reason the 2013 is labeled differently than 2012 and 2014 sal13['Agency'] = sal13['Agency'].str.replace(r'Police Department.+', 'Police Department ') # Select only police department employees all_police11 = sal11[sal11['Agency']=='Police Department'] all_police12 = sal12[sal12['Agency']=='Police Department '] all_police13 = sal13[sal13['Agency']=='Police Department '] all_police14 = sal14[sal14['Agency']=='Police Department '] all_police11.head() # List all job titles (2014) BPD_job_titles = pd.DataFrame(zeros(len(all_police14.groupby('JobTitle').groups)),columns=['job_titles']) BPD_gb_job_titles = all_police14.groupby('JobTitle') for i, j in enumerate(BPD_gb_job_titles.groups): BPD_job_titles['job_titles'].ix[i] = j BPD_job_titles.sort('job_titles',ascending=True,inplace=True) list(BPD_job_titles['job_titles']) def remove_dollar_sign(df_col): parsed_df_col = df_col.str.replace("$","").apply(lambda x: float(x)) return parsed_df_col def parse(df): df['PAnnualSalary'] = remove_dollar_sign(df['AnnualSalary']) df['PGrossPay'] = remove_dollar_sign(df['GrossPay']) df['PayRatio'] = df['PGrossPay'] / df['PAnnualSalary'] df['PayDiff'] = df['PGrossPay'] - df['PAnnualSalary'] #df = df[df['PayRatio']>=1] return df def police_groups(all_police): officers = all_police[(all_police['JobTitle'] == 'POLICE OFFICER') | (all_police['JobTitle'] == 'POLICE OFFICER (EID)')] other_police= all_police[(all_police['JobTitle'] != 'POLICE OFFICER') & (all_police['JobTitle'] != 'POLICE OFFICER (EID)')] return officers, other_police all_police11_full_salary_paid = parse(all_police11) all_police12_full_salary_paid = parse(all_police12) all_police13_full_salary_paid = parse(all_police13) all_police14_full_salary_paid = parse(all_police14) all_police_dfs = {'2011':all_police11_full_salary_paid, '2012':all_police12_full_salary_paid, '2013':all_police13_full_salary_paid, '2014':all_police14_full_salary_paid} field_names=['AvgSalary','AvgPayRatio','AvgPayDiff','TotalPayDiff','MaxPayDiff','MaxPayRatio','EmployeeCount'] '''this = all_police14 print len(this) print len(this[this['PGrossPay']>=this['PAnnualSalary']]) print float(len(this[this['PGrossPay']>=this['PAnnualSalary']]))/float(len(this)) ''' ''' print all_police11['PayRatio'].median() print all_police12['PayRatio'].median() print all_police13['PayRatio'].median() print all_police14['PayRatio'].median() ''' def calc_row_metrics(workforce, year): metrics_row = pd.DataFrame(zeros([1,len(field_names)]),columns=field_names) # average salary metrics_row['AvgSalary'] = workforce['PAnnualSalary'].mean() # average pay ratio metrics_row['AvgPayRatio'] = workforce['PayRatio'].mean() # average pay diff metrics_row['AvgPayDiff'] = workforce['PayDiff'].mean() # max pay ratio metrics_row['MaxPayRatio'] = workforce['PayRatio'].max() # total pay diff metrics_row['TotalPayDiff'] = workforce['PayDiff'].sum() # max pay diff metrics_row['MaxPayDiff'] = workforce['PayDiff'].max() # BPD employee count metrics_row['EmployeeCount'] = len(workforce) return metrics_row def calc_some_metrics(): index = [2011,2012,2013,2014] all_police_metrics = pd.DataFrame(zeros([len(index),len(field_names)]), index=index, columns=field_names) officers_metrics = pd.DataFrame(zeros([len(index),len(field_names)]), index=index, columns=field_names) other_police_metrics = pd.DataFrame(zeros([len(index),len(field_names)]), index=index, columns=field_names) for i, j in enumerate(all_police_dfs): officers, other_police = police_groups(all_police_dfs[str(j)]) all_police_metrics.ix[float(j)] = calc_row_metrics(all_police_dfs[str(j)],j).ix[0] officers_metrics.ix[float(j)] = calc_row_metrics(officers,j).ix[0] other_police_metrics.ix[float(j)] = calc_row_metrics(other_police,j).ix[0] return all_police_metrics, officers_metrics, other_police_metrics officers, other_police = police_groups(all_police_dfs[str(2012)]) all_police_metrics, officers_metrics, other_police_metrics = calc_some_metrics() all_police_metrics officers_metrics other_police_metrics # Looking at the police with the largest bonuses Top20_11 = all_police11.sort('PayDiff', ascending=False)[:20] Top20_12 = all_police12.sort('PayDiff', ascending=False)[:20] Top20_13 = all_police13.sort('PayDiff', ascending=False)[:20] Top20_14 = all_police14.sort('PayDiff', ascending=False)[:20] # Metrics for the top 20 def calc_some_metrics_top20(): Top20_dfs = {2011: Top20_11, 2012: Top20_12, 2013: Top20_13, 2014: Top20_14} Top20_metrics = pd.DataFrame(zeros([4,len(field_names)]), index=[2011,2012,2013,2014], columns=field_names) for i, j in enumerate(Top20_dfs): Top20_metrics.ix[j] = calc_row_metrics(Top20_dfs[j],j).ix[0] return Top20_metrics Top20_metrics = calc_some_metrics_top20() Top20_metrics Top20_11 other_police.sort('PayDiff',ascending=False)[:20] a=sal14['PayDiff'].groupby(sal14['Agency']).get_group('Police Department ') a[a>0].hist() import plotly.plotly as py #all_police14_full_salary_paid = all_police14[all_police14['PayRatio'] >= 1.0] bonus14_percent = (all_police14['PayDiff']/all_police14['PAnnualSalary']) title = 'Baltimore Police
Salary and Bonus/Overtime in FY2014' # slope equal to the median bonus % of salary - half of points are above line, half below slope = bonus14_percent.median() median_bonus = [[0,300000],[0,slope*300000]] trace1 = Scatter(x=all_police14['PAnnualSalary'], y=all_police14['PayDiff'], text = all_police14['PayDiff'].index, mode = 'markers', marker = Marker(size = 5, sizemode = 'diameter', #sizeref = schoolage_pop.max()/1000, opacity = 0.4, color = 'gray') ) trace2 = Scatter(x=median_bonus[0], y=median_bonus[1], text='', mode = 'lines', line=Line(color='black',width=1.0) ) trace3 = Scatter(x=[0,300000], y=[0,0], text='', mode = 'lines', line=Line(color=red_orange,width=2.0) ) trace4 = Scatter(x=[0,300000], y=[0,300000], text='', mode = 'lines', line=Line(color=red_orange,width=1.0) ) layout = Layout( title = title, xaxis = XAxis(range = [0,120000], title = 'Salary', showgrid = False, showline = False, zeroline = False, autotick = True, showticklabels = True), yaxis = YAxis(range = [-100000,120000], title = 'Bonus/Overtime Pay
', showgrid = False, showline = False, zeroline = False, autotick = True, showticklabels = True), autosize = False, width=500, height=500, font = Font(size=14), hovermode = 'closest', showlegend=False, margin=Margin( l=75, r=75, b=50, t=50, pad=0) ) data = Data([trace1,trace2,trace4]) fig = Figure(data=data, layout=layout) py.iplot(fig, filename=title) bonus14_percent = (all_police14['PayDiff']/all_police14['PAnnualSalary']) fig = plt.figure(figsize=[16,3]) #print bonus14_percent.median() #print bonus14_percent.mean() bonus14_percent.hist(bins=50) import numpy as np from scipy.interpolate import UnivariateSpline from matplotlib import pyplot as plt fig = plt.figure(figsize=[16,3]) ax1 = plt.subplot(1,1,1) N = 1000 n = N/10 for i, j in enumerate(all_police_dfs): s = np.array(all_police_dfs[str(j)]['PayDiff'].fillna(0)) p, x = np.histogram(s, bins=n) # bin it into n = N/10 bins x = x[:-1] + (x[1] - x[0])/2 # convert bin edges to centers f = UnivariateSpline(x, p, s=n) plt.plot(x, f(x)) xlim([-20000,50000]) legend(['2011','2012','2013','2014']) # turn off square border around plot ax1.spines["top"].set_visible(False) ax1.spines["bottom"].set_visible(False) ax1.spines["right"].set_visible(False) ax1.spines["left"].set_visible(False) # turn off ticks ax1.tick_params(axis="both", which="both", bottom="off", top="off", labelbottom="on", left="off", right="off", labelleft="on",labelsize=10) ylim([0,600]) plt.show() all_police14[all_police14['PAnnualSalary'] == 43999] all_police14['Name'] = all_police14.index all_police14['Last_Name'] = all_police14['Name'].apply(lambda x: x.split(',')[0]) all_police14['First_Name'] = all_police14['Name'].apply(lambda x: x.split(',')[1] if len(x.split(','))>1 else '') all_police13['Name'] = all_police13.index all_police13['Last_Name'] = all_police13['Name'].apply(lambda x: x.split(',')[0]) all_police13['First_Name'] = all_police13['Name'].apply(lambda x: x.split(',')[1] if len(x.split(','))>1 else '') all_police12['Name'] = all_police12.index all_police12['Last_Name'] = all_police12['Name'].apply(lambda x: x.split(',')[0]) all_police12['First_Name'] = all_police12['Name'].apply(lambda x: x.split(',')[1] if len(x.split(','))>1 else '') ''' police_roll_14 = pd.DataFrame(index=all_police14.index) police_roll_14['First_Name'] = all_police14['First_Name'] police_roll_14['Last_Name'] = all_police14['Last_Name'] police_roll_14.to_csv('Police_Roll_2014.csv') police_roll_12 = pd.DataFrame(index=all_police12.index) police_roll_12['First_Name'] = all_police12['First_Name'] police_roll_12['Last_Name'] = all_police12['Last_Name'] police_roll_13 = pd.DataFrame(index=all_police13.index) police_roll_13['First_Name'] = all_police13['First_Name'] police_roll_13['Last_Name'] = all_police13['Last_Name'] ''' all_police14[all_police14['Last_Name']=='Rice'] all_police14[all_police14['Last_Name']=='White'] all_police14[all_police14['Last_Name']=='Porter'] all_police14[all_police14['Last_Name']=='Miller'] all_police14[all_police14['Last_Name']=='Nero'] all_police14[all_police14['Last_Name']=='Goodson Jr']