#!/usr/bin/env python
# coding: utf-8

# # Auditing Federal Contractors Part 1
# _By [Leon Yin](leonyin.org) Last Updated 2017-06-11_
#
# View this notebook in [NBViewer](http://nbviewer.jupyter.org/github/yinleon/us-spending/blob/master/0_get_data.ipynb) or [Github](https://github.com/yinleon/us-spending/blob/master/0_get_data.ipynb)
#
# This two-part module will show you how to request data from USASpending.gov, store it as a tab-separated values (tsv) file, and perform some temporal and spatial analysis.
#
# I hope these two modules are clear enough to be used by Journalists, Lawyers, and other folks who wish to use data to audit government contractors.
#
# Please view part 2 on [NBViewer](http://nbviewer.jupyter.org/github/yinleon/us-spending/blob/master/1_analysis_methods.ipynb) or [Github](https://github.com/yinleon/us-spending/blob/master/1_analysis_methods.ipynb)

# ## Downloading data from USA Spending
# This notebook describes how to download annual records from the USASpending.gov website for a specific contractor.
#
# In this example we download all records for the Corrections Corporation of America (CCA) and all of its subsidiaries.
#
# Let's start by installing the Python packages used in this module.

# In[5]:

get_ipython().run_cell_magic('sh', '', 'pip install -r requirements.txt\n')

# In[21]:

get_ipython().run_line_magic('matplotlib', 'inline')
import os
from zipfile import ZipFile
from multiprocessing import Pool
import requests
from io import BytesIO
from itertools import repeat
import datetime

import pandas as pd

# In[2]:

# dates of the two most recent data archive updates, plus the default
# fiscal year and department.
latest_update = '20170515'
next_latest = '20170115'
year = 2017
dep = 'All'

# In[3]:

# template for the download URL; {YEAR} is filled in per request below.
url = ('http://download.usaspending.gov/data_archives/{UP_MONTH}/'
       'tsv/{{YEAR}}_{DEP}_Contracts_Full_{UP_DATE}.tsv.zip'.format(
           UP_MONTH=latest_update[:-2], DEP=dep, UP_DATE=latest_update))
url

# In[4]:

# for files that were not updated by the date above...
url_legacy = ('http://download.usaspending.gov/data_archives/{UP_MONTH}/'
              'tsv/{{YEAR}}_{DEP}_Contracts_Full_{UP_DATE}.tsv.zip'.format(
                  UP_MONTH=next_latest[:-2], DEP=dep, UP_DATE=next_latest))

# In[5]:

data_in = 'data_in/spending'

# In[6]:

# these are the years we are interested in:
start = 2000
end = 2017
years = [y for y in range(start, end + 1)]

# In[7]:

# These are the aliases for each company. Matching is case-insensitive,
# but each alias needs to be enclosed in single or double quotes!
companies = ['Corrections Corporation of America', 'CoreCivic', 'TransCor']
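# Before downloading anything, it can help to sanity-check how those aliases
# will be matched. The next cell is an optional illustration (not part of the
# original workflow): it joins the aliases into one RegEx with
# `'|'.join(companies)`, the same pattern handed to the download function
# below, and tests it against a few made-up vendor names using the same
# case-insensitive `str.contains` filter.

# In[ ]:

# hypothetical vendor names, just to show which rows the regex would keep
sample_vendors = pd.Series(['CORRECTIONS CORPORATION OF AMERICA',
                            'CoreCivic of Tennessee LLC',
                            'TransCor America',
                            'Some Unrelated Vendor Inc'])
alias_regex = '|'.join(companies)
sample_vendors[sample_vendors.str.contains(alias_regex, case=False)]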
# In[9]:

def load_and_sift(year, regex):
    '''Downloads a zipped tsv file from:
    https://www.usaspending.gov/DownloadCenter/Pages/dataarchives.aspx
    into a requests object.
    Expands the zipfile and reads each file, chunkwise, into Pandas dataframes.
    Each dataframe (df) chunk is filtered by the companies' RegEx expression.

    Args:
        year (int): The fiscal year of records to load.
        regex (string): A regex expression of company name(s).
    Returns:
        df_final: a Pandas DataFrame containing the matching records
            from the given year.
    '''
    print(year)
    r = requests.get(url.format(YEAR=year))
    last_update = datetime.datetime.strptime(latest_update, '%Y%m%d')

    if r.status_code == 404:  # if the url doesn't work, use the legacy url.
        r = requests.get(url_legacy.format(YEAR=year))
        last_update = datetime.datetime.strptime(next_latest, '%Y%m%d')

    if r.status_code == 200:  # make sure the download was successful.
        # the downloaded stream is a zip archive
        zipfile = ZipFile(BytesIO(r.content))
        df_final = pd.DataFrame()

        # for each file in the zip archive
        for f in zipfile.namelist():
            # process the file in dataframe chunks!
            for df in pd.read_csv(zipfile.open(f), sep='\t',
                                  chunksize=100000, low_memory=False):
                # filter the dataframe chunk for active vendors
                # and relevant company names.
                df = df[(~df['vendorname'].isnull()) &
                        (df['vendorname'].str.contains(regex, case=False))]

                # some date tags...
                df['lastupdate'] = last_update
                df['contract_year'] = year
                df['filename'] = f
                df['search_terms'] = regex

                df_final = df_final.append(df, ignore_index=True)

        return df_final

    else:
        raise requests.HTTPError("bad request for year {}".format(year))

# This next step might take a while.
# It's expedited using multiprocessing's `Pool`, which parallelizes the task
# across years.
#
# It is equivalent to
#
# ```
# df_list = []
# for year, co in zip(years, repeat('|'.join(companies))):
#     df_list.append(load_and_sift(year, co))
# df = pd.concat(df_list, ignore_index=True)
# ```

# In[16]:

with Pool() as pool:
    df_list = pool.starmap(load_and_sift,
                           zip(years, repeat('|'.join(companies))))
df = pd.concat(df_list, ignore_index=True)

# We can take a peek at 5 random records here:

# In[9]:

df.sample(5)

# There are a lot of columns! So it might be hard to know what you're after.

# In[10]:

len(df.columns)

# Save the data to a gzipped tab-separated values (tsv) file.

# In[15]:

outfile = data_in + '_' + companies[0].replace(' ', '_').lower() + '.tsv.gz'
df.to_csv(outfile, sep='\t', compression='gzip', index=False)
print("Data saved to {}".format(outfile))

# Now we're ready to perform some analysis!
#
# [See Part 2 here!](http://nbviewer.jupyter.org/github/yinleon/us-spending/blob/master/1_analysis_methods.ipynb)
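# Appendix: as an optional sanity check (not part of the original workflow),
# the gzipped tsv written above can be read straight back with pandas to
# confirm it round-trips; `outfile` is the path printed above.

# In[ ]:

# reload the saved records and confirm the row/column counts look sane
df_check = pd.read_csv(outfile, sep='\t', compression='gzip', low_memory=False)
print(df_check.shape)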