#!/usr/bin/env python # coding: utf-8 # # Auditing Federal Contractors Part II # _By [Leon Yin](leonyin.org) Last Updated 2017-06-11_ # # View this notebook in [NBViewer](http://nbviewer.jupyter.org/github/yinleon/us-spending/blob/master/1_analysis_methods.ipynb) or [Github](https://github.com/yinleon/us-spending/blob/master/1_analysis_methods.ipynb) # ## Analysis Methods # After downloading data from CCA from the [part 1](http://nbviewer.jupyter.org/github/yinleon/us-spending/blob/master/0_get_data.ipynb) of this module, we can analyze it using Python Pandas, and Matplotlib. # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import glob import pandas as pd import matplotlib.pyplot as plt # In[9]: data_in = 'data_in/spending_corrections_corporation_of_america.tsv.gz' df = pd.read_csv(data_in, sep='\t', compression='gzip') # Recall, this is what the data looks like: # In[10]: df.head(3) # We have access to fast counting anf crunching in Pandas: # In[14]: print("CoreCivic recieved \${:,.2f} since 2000".format( df['dollarsobligated'].sum())) # We can perform aggragate funcions to get annual dollars obligated # In[13]: df.groupby('contract_year')['dollarsobligated'].sum().plot( kind='bar', title='Annual USD Contract Obligations to CoreCivic' ).set_xlabel('Contract Year'); # Let's make these plots Seaborn-cool # In[73]: plt.style.use('seaborn') # In[75]: df.groupby('contract_year')['dollarsobligated'].sum().plot( kind='bar', title='Annual USD Contract Obligations to CoreCivic' ).set_xlabel('Contract Year'); # In[74]: df[df['dollarsobligated'] > 0].groupby('contract_year')['dollarsobligated'].count().plot( kind='line', marker= 'o', title='DOJ Number of Annual Contracts to CoreCivic' ).set_xlabel('Contract Year'); # ## What does this look like state-to-state? # We can create a chloropleth with metadata hover-ables with Plotly. # In[18]: import us import numpy as np import plotly # ### set up plotly # [get started](https://plot.ly/python/getting-started/) on their website to get an API key. # In[45]: # aset up plotly creds plotly.tools.set_credentials_file( username='leonyin', api_key='') # In[60]: def group_df(df): df_t = df.copy() df_t['state_code'] = df_t['statecode'].str.split(':').str.get(0) df_merge = pd.merge( pd.merge(df_t.groupby('state_code')['unique_transaction_id'].count().reset_index(), df_t.groupby('state_code')['dollarsobligated'].sum().reset_index()), df_t.groupby('state_code')['dollarsobligated'].mean().reset_index(), on='state_code') df_merge.columns = ['state_code', 'contracts_count', 'total_dollars', 'mean_contract_amount'] return df_merge def build_annotation(df): S = [str(us.states.lookup(unicode(s))) for s in df['state_code'].tolist()] A = ["Avg. Contract Val: \${:,.2f}".format(int(a)) for a in df['mean_contract_amount'].tolist()] C = ["Num. of Contracts: {}".format(str(c)) for c in df['contracts_count'].tolist()] return ['
'.join([s,a,c]) for (s,a,c) in zip(S, A, C)] def chloropleth(df): df3 = group_df(df) scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'], [0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']] data = [dict( type='choropleth', colorscale = scl, autocolorscale = False, locations = df3['state_code'], z = df3['total_dollars'], locationmode = 'USA-states', text = build_annotation(df3), marker = dict( line = dict ( color = 'rgb(255,255,255)', width = 2)), colorbar = dict( title = "USD"))] layout = dict( title = ('2000-2017 DOJ Total Contract Value with CoreCivic
' '(Hover for breakdown)'), geo = dict( scope='usa', projection=dict(type='albers usa'), showlakes=True, lakecolor='rgb(255, 255, 255)')) fig = dict( data=data, layout=layout ) return plotly.plotly.iplot( fig, filename='d3-cloropleth-map-corecivic' ) # In[61]: chloropleth(df) # ## Cross-contractor analysis # We can compare the contract data between two contractors. # Using the methods from the previous section, we can download GEO group, and compare the number of contracts and dollars obligated over time. # In[63]: def annual_spending(): fig, ax = plt.subplots(2, figsize=[10,6]) df_1[df_1['dollarsobligated'] > 0].groupby( 'contract_year')['dollarsobligated'].sum().plot( kind='line', marker='o', label='CoreCivic', ax=ax[0], title='DOJ Annual Contracts Obligations USD' ) df_2[df_2['dollarsobligated'] > 0].groupby( 'contract_year')['dollarsobligated'].sum().plot( kind='line', marker='o', label='GEO', color='lightblue', ax=ax[0] ).get_xaxis().set_visible(False) ax[0].xaxis.set_ticks(np.arange(2000, 2017, 2)) df_1[df_1['dollarsobligated'] > 0].groupby( 'contract_year')['dollarsobligated'].count().plot( kind='line', marker='o', ax=ax[1], title='DOJ Number of Annual Contracts' ) df_2[df_2['dollarsobligated'] > 0].groupby( 'contract_year')['dollarsobligated'].count().plot( kind='line', marker='o', color='lightblue', ax=ax[1] ).set_xlabel('Contract Year') ax[0].legend(); # In[ ]: get_ipython().run_line_magic('ls', '') # In[71]: cca = 'data_in/spending_corrections_corporation_of_america.tsv.gz' df_1 = pd.read_csv(cca, sep='\t', compression='gzip') geo = 'data_in/spending_geo_group.tsv.gz' df_2 = pd.read_csv(geo, sep='\t', compression='gzip') print("CCA recieved \${:,.2f} since 2000".format( df_1['dollarsobligated'].sum())) print("GEO Group recieved \${:,.2f} since 2000".format( df_2['dollarsobligated'].sum())) # In[72]: annual_spending() # Using these methods you can dig deeper, and present some nice graphs!