import pandas as pd
import glob
import plotly.plotly as py
from plotly.graph_objs import *

gray_light = '#bbbbbb'
gray_med = '#737373'
red_orange = '#ff3700'

# Import datasets for each year
sal11 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2011.csv',index_col=0)
sal12 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2012.csv',index_col=0)
sal13 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2013.csv',index_col=0,names=sal12.columns,skiprows=0)
sal14 = pd.read_csv('Baltimore_City_Employee_Salaries_FY2014.csv',index_col=0)

# For some awful reason the 2013 is labeled differently than 2012 and 2014
sal13['Agency'] = sal13['Agency'].str.replace(r'Police Department.+', 'Police Department ')

# Select only police department employees
all_police11 = sal11[sal11['Agency']=='Police Department']
all_police12 = sal12[sal12['Agency']=='Police Department ']
all_police13 = sal13[sal13['Agency']=='Police Department ']
all_police14 = sal14[sal14['Agency']=='Police Department ']


all_police11.head()

# List all job titles (2014)
BPD_job_titles = pd.DataFrame(zeros(len(all_police14.groupby('JobTitle').groups)),columns=['job_titles'])
BPD_gb_job_titles = all_police14.groupby('JobTitle')
for i, j in enumerate(BPD_gb_job_titles.groups):
    BPD_job_titles['job_titles'].ix[i] = j
    
BPD_job_titles.sort('job_titles',ascending=True,inplace=True)
list(BPD_job_titles['job_titles'])

def remove_dollar_sign(df_col):
    parsed_df_col = df_col.str.replace("$","").apply(lambda x: float(x))
    return parsed_df_col

def parse(df):
    df['PAnnualSalary'] = remove_dollar_sign(df['AnnualSalary'])
    df['PGrossPay'] = remove_dollar_sign(df['GrossPay'])
    df['PayRatio'] = df['PGrossPay'] / df['PAnnualSalary']
    df['PayDiff'] = df['PGrossPay'] - df['PAnnualSalary']
    #df = df[df['PayRatio']>=1]
    return df

def police_groups(all_police):
    officers = all_police[(all_police['JobTitle'] == 'POLICE OFFICER') | (all_police['JobTitle'] == 'POLICE OFFICER (EID)')]
    other_police= all_police[(all_police['JobTitle'] != 'POLICE OFFICER') & (all_police['JobTitle'] != 'POLICE OFFICER (EID)')]
    return officers, other_police

all_police11_full_salary_paid = parse(all_police11)
all_police12_full_salary_paid = parse(all_police12)
all_police13_full_salary_paid = parse(all_police13)
all_police14_full_salary_paid = parse(all_police14)

all_police_dfs = {'2011':all_police11_full_salary_paid,
                  '2012':all_police12_full_salary_paid,
                  '2013':all_police13_full_salary_paid,
                  '2014':all_police14_full_salary_paid}
field_names=['AvgSalary','AvgPayRatio','AvgPayDiff','TotalPayDiff','MaxPayDiff','MaxPayRatio','EmployeeCount']

'''this = all_police14
print len(this)
print len(this[this['PGrossPay']>=this['PAnnualSalary']])
print float(len(this[this['PGrossPay']>=this['PAnnualSalary']]))/float(len(this))

'''

'''
print all_police11['PayRatio'].median()
print all_police12['PayRatio'].median()
print all_police13['PayRatio'].median()
print all_police14['PayRatio'].median()
'''


def calc_row_metrics(workforce, year):   

    metrics_row = pd.DataFrame(zeros([1,len(field_names)]),columns=field_names)
    
    # average salary
    metrics_row['AvgSalary'] = workforce['PAnnualSalary'].mean()
    # average pay ratio
    metrics_row['AvgPayRatio'] = workforce['PayRatio'].mean()
    # average pay diff
    metrics_row['AvgPayDiff'] = workforce['PayDiff'].mean()
    # max pay ratio
    metrics_row['MaxPayRatio'] = workforce['PayRatio'].max()
    # total pay diff
    metrics_row['TotalPayDiff'] = workforce['PayDiff'].sum()
    # max pay diff
    metrics_row['MaxPayDiff'] = workforce['PayDiff'].max()
    # BPD employee count
    metrics_row['EmployeeCount'] = len(workforce)
    return metrics_row


def calc_some_metrics():

    index =  [2011,2012,2013,2014]
    all_police_metrics = pd.DataFrame(zeros([len(index),len(field_names)]), index=index, columns=field_names)    
    officers_metrics = pd.DataFrame(zeros([len(index),len(field_names)]), index=index, columns=field_names)
    other_police_metrics = pd.DataFrame(zeros([len(index),len(field_names)]), index=index, columns=field_names)
    
    for i, j in enumerate(all_police_dfs):
        
        officers, other_police = police_groups(all_police_dfs[str(j)])
        
        all_police_metrics.ix[float(j)] = calc_row_metrics(all_police_dfs[str(j)],j).ix[0]
        officers_metrics.ix[float(j)] = calc_row_metrics(officers,j).ix[0]
        other_police_metrics.ix[float(j)] = calc_row_metrics(other_police,j).ix[0]

    
    return all_police_metrics, officers_metrics, other_police_metrics
    

officers, other_police = police_groups(all_police_dfs[str(2012)])


all_police_metrics, officers_metrics, other_police_metrics = calc_some_metrics()

all_police_metrics

officers_metrics

other_police_metrics

# Looking at the police with the largest bonuses
Top20_11 = all_police11.sort('PayDiff', ascending=False)[:20]
Top20_12 = all_police12.sort('PayDiff', ascending=False)[:20]
Top20_13 = all_police13.sort('PayDiff', ascending=False)[:20]
Top20_14 = all_police14.sort('PayDiff', ascending=False)[:20]

# Metrics for the top 20
def calc_some_metrics_top20():
    
    Top20_dfs = {2011: Top20_11, 2012: Top20_12, 2013: Top20_13, 2014: Top20_14}
    Top20_metrics = pd.DataFrame(zeros([4,len(field_names)]), index=[2011,2012,2013,2014], columns=field_names)    
    
    for i, j in enumerate(Top20_dfs):
        Top20_metrics.ix[j] = calc_row_metrics(Top20_dfs[j],j).ix[0]
   
    return Top20_metrics

Top20_metrics = calc_some_metrics_top20()

Top20_metrics

Top20_11

other_police.sort('PayDiff',ascending=False)[:20]


a=sal14['PayDiff'].groupby(sal14['Agency']).get_group('Police Department ')
a[a>0].hist()

import plotly.plotly as py

#all_police14_full_salary_paid = all_police14[all_police14['PayRatio'] >= 1.0]
bonus14_percent = (all_police14['PayDiff']/all_police14['PAnnualSalary'])

title = 'Baltimore Police <br> Salary and Bonus/Overtime in FY2014'

# slope equal to the median bonus % of salary - half of points are above line, half below
slope = bonus14_percent.median()

median_bonus = [[0,300000],[0,slope*300000]]

trace1 = Scatter(x=all_police14['PAnnualSalary'],
                 y=all_police14['PayDiff'],
                 text = all_police14['PayDiff'].index,
                 mode = 'markers',
                 marker = Marker(size = 5,
                                 sizemode = 'diameter',
                                 #sizeref = schoolage_pop.max()/1000,
                                 opacity = 0.4,
                                 color = 'gray')
                 )


trace2 = Scatter(x=median_bonus[0],
              y=median_bonus[1],
              text='',
              mode = 'lines',
              line=Line(color='black',width=1.0)
              )


trace3 = Scatter(x=[0,300000],
              y=[0,0],
              text='',
              mode = 'lines',
              line=Line(color=red_orange,width=2.0)
              )

trace4 = Scatter(x=[0,300000],
              y=[0,300000],
              text='',
              mode = 'lines',
              line=Line(color=red_orange,width=1.0)
              )

layout = Layout(
    title = title,
    xaxis = XAxis(range = [0,120000],
                  title = 'Salary',
                  showgrid = False,
                  showline = False,
                  zeroline = False,
                  autotick = True,
                  showticklabels = True),
    yaxis = YAxis(range = [-100000,120000],
                  title = 'Bonus/Overtime Pay <br>',
                  showgrid = False,
                  showline = False,
                  zeroline = False,
                  autotick = True,
                  showticklabels = True),
    autosize = False,
    width=500,
    height=500,
    font = Font(size=14),
    hovermode = 'closest',
    showlegend=False,
    margin=Margin(
        l=75,
        r=75,
        b=50,
        t=50,
        pad=0)
)

data = Data([trace1,trace2,trace4])
fig = Figure(data=data, layout=layout)
py.iplot(fig, filename=title)

bonus14_percent = (all_police14['PayDiff']/all_police14['PAnnualSalary'])

fig = plt.figure(figsize=[16,3])
#print bonus14_percent.median()
#print bonus14_percent.mean()
bonus14_percent.hist(bins=50)


import numpy as np
from scipy.interpolate import UnivariateSpline
from matplotlib import pyplot as plt

fig = plt.figure(figsize=[16,3])
ax1 = plt.subplot(1,1,1)
N = 1000
n = N/10
for i, j in enumerate(all_police_dfs):
    s = np.array(all_police_dfs[str(j)]['PayDiff'].fillna(0))
    p, x = np.histogram(s, bins=n) # bin it into n = N/10 bins
    x = x[:-1] + (x[1] - x[0])/2   # convert bin edges to centers
    f = UnivariateSpline(x, p, s=n)
    plt.plot(x, f(x))

xlim([-20000,50000])    
legend(['2011','2012','2013','2014'])

# turn off square border around plot
ax1.spines["top"].set_visible(False)  
ax1.spines["bottom"].set_visible(False)  
ax1.spines["right"].set_visible(False)  
ax1.spines["left"].set_visible(False)

# turn off ticks
ax1.tick_params(axis="both", which="both", bottom="off", top="off",
               labelbottom="on", left="off", right="off", labelleft="on",labelsize=10)
ylim([0,600])
plt.show()


all_police14[all_police14['PAnnualSalary'] == 43999]

all_police14['Name'] = all_police14.index
all_police14['Last_Name'] = all_police14['Name'].apply(lambda x: x.split(',')[0])
all_police14['First_Name'] = all_police14['Name'].apply(lambda x: x.split(',')[1] if len(x.split(','))>1 else '')

all_police13['Name'] = all_police13.index
all_police13['Last_Name'] = all_police13['Name'].apply(lambda x: x.split(',')[0])
all_police13['First_Name'] = all_police13['Name'].apply(lambda x: x.split(',')[1] if len(x.split(','))>1 else '')

all_police12['Name'] = all_police12.index
all_police12['Last_Name'] = all_police12['Name'].apply(lambda x: x.split(',')[0])
all_police12['First_Name'] = all_police12['Name'].apply(lambda x: x.split(',')[1] if len(x.split(','))>1 else '')


'''
police_roll_14 = pd.DataFrame(index=all_police14.index)
police_roll_14['First_Name'] = all_police14['First_Name']
police_roll_14['Last_Name'] = all_police14['Last_Name']
police_roll_14.to_csv('Police_Roll_2014.csv')

police_roll_12 = pd.DataFrame(index=all_police12.index)
police_roll_12['First_Name'] = all_police12['First_Name']
police_roll_12['Last_Name'] = all_police12['Last_Name']

police_roll_13 = pd.DataFrame(index=all_police13.index)
police_roll_13['First_Name'] = all_police13['First_Name']
police_roll_13['Last_Name'] = all_police13['Last_Name']
'''


all_police14[all_police14['Last_Name']=='Rice']

all_police14[all_police14['Last_Name']=='White']

all_police14[all_police14['Last_Name']=='Porter']

all_police14[all_police14['Last_Name']=='Miller']

all_police14[all_police14['Last_Name']=='Nero']

all_police14[all_police14['Last_Name']=='Goodson Jr']