#!/usr/bin/env python
# coding: utf-8

# # Auditing Federal Contractors Part 1
# _By [Leon Yin](leonyin.org) Last Updated 2017-06-11_
#
# View this notebook in [NBViewer](http://nbviewer.jupyter.org/github/yinleon/us-spending/blob/master/0_get_data.ipynb) or [Github](https://github.com/yinleon/us-spending/blob/master/0_get_data.ipynb)
#
# This two-part module will show you how to request data from USASpending.gov, store it as a tab-separated values (tsv) file, and perform some temporal and spatial analysis.
#
# I hope these two modules are clear enough to be used by Journalists, Lawyers, and other folks who wish to use data to audit government contractors.
#
# Please view part 2 on [NBViewer](http://nbviewer.jupyter.org/github/yinleon/us-spending/blob/master/1_analysis_methods.ipynb) or [Github](https://github.com/yinleon/us-spending/blob/master/1_analysis_methods.ipynb)

# ## Downloading data from USA Spending
# This notebook describes how to download annual records from the USASpending.gov website for a specific contractor.
#
# In this example we download all records for the Corrections Corporation of America (CCA) and all of its subsidiaries.
#
# Let's start by installing the Python packages used in this module.

# In[5]:

get_ipython().run_cell_magic('sh', '', 'pip install -r requirements.txt\n')

# In[21]:

get_ipython().run_line_magic('matplotlib', 'inline')
import os
from zipfile import ZipFile
from multiprocessing import Pool
import requests
from io import BytesIO
from itertools import repeat
import datetime

import pandas as pd

# In[2]:

# dates of the two most recent data archive updates, plus the default
# fiscal year and department.
latest_update = '20170515'
next_latest = '20170115'
year = 2017
dep = 'All'

# In[3]:

# template for the download URL; {YEAR} is filled in per request below.
url = ('http://download.usaspending.gov/data_archives/{UP_MONTH}/'
       'tsv/{{YEAR}}_{DEP}_Contracts_Full_{UP_DATE}.tsv.zip'.format(
           UP_MONTH=latest_update[:-2], DEP=dep, UP_DATE=latest_update))
url

# In[4]:

# for files that were not updated by the date above...
url_legacy = ('http://download.usaspending.gov/data_archives/{UP_MONTH}/'
              'tsv/{{YEAR}}_{DEP}_Contracts_Full_{UP_DATE}.tsv.zip'.format(
                  UP_MONTH=next_latest[:-2], DEP=dep, UP_DATE=next_latest))

# In[5]:

data_in = 'data_in/spending'

# In[6]:

# these are the years we are interested in:
start = 2000
end = 2017
years = [y for y in range(start, end + 1)]

# In[7]:

# These are the aliases for each company. Matching is case-insensitive,
# but each alias needs to be enclosed in single or double quotes!
companies = ['Corrections Corporation of America', 'CoreCivic', 'TransCor']
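# Before downloading anything, it can help to sanity-check how those aliases
# will be matched. The next cell is an optional illustration (not part of the
# original workflow): it joins the aliases into one RegEx with
# `'|'.join(companies)`, the same pattern handed to the download function
# below, and tests it against a few made-up vendor names using the same
# case-insensitive `str.contains` filter.

# In[ ]:

# hypothetical vendor names, just to show which rows the regex would keep
sample_vendors = pd.Series(['CORRECTIONS CORPORATION OF AMERICA',
                            'CoreCivic of Tennessee LLC',
                            'TransCor America',
                            'Some Unrelated Vendor Inc'])
alias_regex = '|'.join(companies)
sample_vendors[sample_vendors.str.contains(alias_regex, case=False)]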
# In[9]:

def load_and_sift(year, regex):
    '''Downloads a zipped tsv file from:
    https://www.usaspending.gov/DownloadCenter/Pages/dataarchives.aspx
    into a requests object.
    Expands the zipfile and reads each file, chunkwise, into Pandas dataframes.
    Each dataframe (df) chunk is filtered by the companies' RegEx expression.

    Args:
        year (int): The fiscal year of records to load.
        regex (string): A regex expression of company name(s).
    Returns:
        df_final: a Pandas DataFrame containing the matching records
            from the given year.
    '''
    print(year)
    r = requests.get(url.format(YEAR=year))
    last_update = datetime.datetime.strptime(latest_update, '%Y%m%d')

    if r.status_code == 404:  # if the url doesn't work, use the legacy url.
        r = requests.get(url_legacy.format(YEAR=year))
        last_update = datetime.datetime.strptime(next_latest, '%Y%m%d')

    if r.status_code == 200:  # make sure the download was successful.
        # the downloaded stream is a zip archive
        zipfile = ZipFile(BytesIO(r.content))
        df_final = pd.DataFrame()

        # for each file in the zip archive
        for f in zipfile.namelist():
            # process the file in dataframe chunks!
            for df in pd.read_csv(zipfile.open(f), sep='\t',
                                  chunksize=100000, low_memory=False):
                # filter the dataframe chunk for active vendors
                # and relevant company names.
                df = df[(~df['vendorname'].isnull()) &
                        (df['vendorname'].str.contains(regex, case=False))]

                # some date tags...
                df['lastupdate'] = last_update
                df['contract_year'] = year
                df['filename'] = f
                df['search_terms'] = regex

                df_final = df_final.append(df, ignore_index=True)

        return df_final

    else:
        raise requests.HTTPError("bad request for year {}".format(year))

# This next step might take a while.
# It's expedited using multiprocessing's `Pool`, which parallelizes the task
# across years.
#
# It is equivalent to
#
# ```
# df_list = []
# for year, co in zip(years, repeat('|'.join(companies))):
#     df_list.append(load_and_sift(year, co))
# df = pd.concat(df_list, ignore_index=True)
# ```

# In[16]:

with Pool() as pool:
    df_list = pool.starmap(load_and_sift,
                           zip(years, repeat('|'.join(companies))))
df = pd.concat(df_list, ignore_index=True)

# We can take a peek at 5 random records here:

# In[9]:

df.sample(5)

# There are a lot of columns! So it might be hard to know what you're after.

# In[10]:

len(df.columns)

# Save the data to a gzipped tab-separated values (tsv) file.

# In[15]:

outfile = data_in + '_' + companies[0].replace(' ', '_').lower() + '.tsv.gz'
df.to_csv(outfile, sep='\t', compression='gzip', index=False)
print("Data saved to {}".format(outfile))

# Now we're ready to perform some analysis!
#
# [See Part 2 here!](http://nbviewer.jupyter.org/github/yinleon/us-spending/blob/master/1_analysis_methods.ipynb)
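# Appendix: as an optional sanity check (not part of the original workflow),
# the gzipped tsv written above can be read straight back with pandas to
# confirm it round-trips; `outfile` is the path printed above.

# In[ ]:

# reload the saved records and confirm the row/column counts look sane
df_check = pd.read_csv(outfile, sep='\t', compression='gzip', low_memory=False)
print(df_check.shape)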