#!/usr/bin/env python
# coding: utf-8

# ### Retrieve State Usage Data
# 
# This notebook downloads water use data for a user-specified state and year from the National Water Information System server, and translates these data into a physical supply and use table (PSUT), using a preformatted PSUT template and a value mapping table.
# 
# Original data are from the USGS state water programs. Below is a sample URL for Louisiana:
# `https://waterdata.usgs.gov/la/nwis/water_use?format=rdb&rdb_compression=value&wu_area=County&wu_year=2000%2C2005%2C2010&wu_county=ALL&wu_category=ALL&wu_county_nms=--ALL%2BCounties--&wu_category_nms=--ALL%2BCategories--`
# 
# ##### Workflow:
# * Get the state and year from the user
# * Get the filenames for the PSUT template and value mapping table
# * Construct the URL and retrieve the data into a pandas data frame
# * Melt/gather the usage columns into row values under the column name 'Group'
# * Remove rows with no usage data (identified by not having "Mgal" in the 'Group' name)
# 
# #### Required 3rd party modules
# The following may require installation before running this notebook. Each can be installed using `pip`:
# * Pandas `------> pip install pandas`
# * OpenPyXl `----> pip install openpyxl`
# 
# ---
# `Created September 2017`  
# `John.Fay@duke.edu`
# 
# ---
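# ##### (Optional) Before running the full workflow, it can help to see what the raw RDB payload looks like. The cell below is a minimal sketch, not part of the original workflow: it fetches the sample Louisiana URL shown above (trimmed to a single year, which is assumed to be accepted by the server) using Python 3's `urllib.request`, and prints the first few lines. Comment lines begin with `#`, followed by tab-delimited header and data rows.

# In[ ]:

#OPTIONAL: preview the first lines of the raw RDB response for the sample URL
import urllib.request
sampleUrl = ('https://waterdata.usgs.gov/la/nwis/water_use?format=rdb'
             '&rdb_compression=value&wu_area=County&wu_year=2010'
             '&wu_county=ALL&wu_category=ALL'
             '&wu_county_nms=--ALL%2BCounties--&wu_category_nms=--ALL%2BCategories--')
with urllib.request.urlopen(sampleUrl) as response:
    #Print the first 10 lines of the tab-delimited (RDB) payload
    for line in response.read().decode('utf-8').splitlines()[:10]:
        print(line)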
# ### Begin by specifying the 2-character state code and the year to process

# In[1]:

#Specify the state and year to process
state = 'la'  #Louisiana
year = 2010

# ### Next, specify the paths to the PSUT template and the value mapping files

# In[2]:

#Get the input filenames
templateFilename = '../Data/Templates/StatePSUT_Template.xlsx'
remapFilename = '../Data/RemapTables/StatePSUTLookup.csv'

# ### The rest is automated...
# #### First, we need to import the required packages, installing 3rd party packages if necessary

# In[3]:

#Import built-in modules
import sys, os, urllib.parse
from shutil import copyfile

# In[4]:

#Import pandas, installing it first if necessary...
try:
    import pandas as pd
except ImportError:
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas'])
    import pandas as pd

# In[5]:

#Import openpyxl, installing it first if necessary...
try:
    from openpyxl import load_workbook
except ImportError:
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'openpyxl'])
    from openpyxl import load_workbook

# ##### Output data will be named with the state code and year (e.g. `la_2010.xlsx`) and will be saved in the StateData subfolder of the Data directory.
# This folder will be created if it does not already exist, and the PSUT template will then be copied to this location under the output filename.

# In[6]:

#Create the StateData folder, if not already present
outFolder = '../Data/StateData'
if not os.path.exists(outFolder):
    os.mkdir(outFolder)

# In[7]:

#Set the output filename, putting it in the output data folder
outFilename = outFolder + os.sep + '{0}_{1}.xlsx'.format(state, year)
print("Output will be written to " + outFilename)

# In[8]:

#Copy the template to the output filename
copyfile(src=templateFilename, dst=outFilename)

# ##### Load in the value mapping table as a data frame

# In[9]:

#Get the remap table and load it as a dataframe
dfRemap = pd.read_csv(remapFilename, dtype='str', index_col="Index")

# ##### Construct the URL for data retrieval and then pull the on-line data into an in-memory data frame named `dfRaw`.

# In[10]:

#Set the data URL path and parameters and construct the url
path = 'https://waterdata.usgs.gov/{}/nwis/water_use?'.format(state)
values = {'format': 'rdb',
          'rdb_compression': 'value',
          'wu_area': 'County',
          'wu_year': year,
          'wu_county': 'ALL',
          'wu_county_nms': '--ALL+Counties--',
          'wu_category_nms': '--ALL+Categories--'
         }
url = path + urllib.parse.urlencode(values)

# In[11]:

#Pull data in using the URL and remove the 2nd row of headers
dfRaw = pd.read_table(url, comment='#', header=[0,1], na_values='-')
dfRaw.columns = dfRaw.columns.droplevel(level=1)

# In[12]:

#CHECK: Display a sample of the retrieved data
dfRaw.head()

# ##### With the data now held locally, we reformat the table so that each of the usage (and other) columns is "[melted](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html)" into a single column: the original column name is stored in one new field and its value in another. This facilitates subsequent analysis, which includes:
# * Removing rows (formerly columns) that don't report volume, e.g. population-served values, and
# * Ensuring that volume data are floating point numbers.
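# ##### As a minimal illustration of what the melt step does, the following optional cell (a sketch on made-up toy data, not drawn from the USGS service) reshapes a tiny wide-format frame into the same long format produced below.

# In[ ]:

#DEMO: melt a tiny wide-format frame into the long 'Group'/'MGal' format used below
dfDemo = pd.DataFrame({'county_nm': ['Acadia Parish', 'Allen Parish'],
                       'Domestic self-supplied Mgal/d': [1.2, 0.8],
                       'Population': [61000, 25000]})
#Each non-id column name lands in 'Group'; its cell values land in 'MGal'
pd.melt(dfDemo, id_vars=['county_nm'], var_name='Group', value_name='MGal')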
# In[13]:

#Tidy the data: transform so data in each usage column become row values, with a new column listing the usage type
rowHeadings = ['county_cd', 'county_nm', 'state_cd', 'state_name', 'year']
dfTidy = pd.melt(dfRaw, id_vars=rowHeadings, value_name='MGal', var_name='Group')
print("Data transformed from {0} rows/columns to {1} rows/columns".format(dfRaw.shape, dfTidy.shape))

# In[14]:

#Remove rows that don't have volume data (i.e. keep only rows whose 'Group' name contains 'Mgal')
dfTidy = dfTidy[dfTidy['Group'].str.contains('Mgal')].copy(deep=True)
dfTidy.shape

# In[15]:

#Change the type of the MGal column to float
dfTidy['MGal'] = dfTidy.MGal.astype("float")

# In[16]:

#CHECK: Show the structure of the 'tidied' data frame
dfTidy.head()

# ##### Next, we join the value mapping table to the tidied data frame, thus linking each value (in MGal/year) with the address of the PSUT cell into which it should be ported.

# In[17]:

#Join the remap table
dfAll = pd.merge(dfTidy, dfRemap, how='inner', left_on="Group", right_on="Group")
dfAll.head()

# ##### With the tables joined, we can open the output workbook and start porting over values

# In[18]:

#CHECK: Confirm the output workbook exists
os.path.exists(outFilename)

# In[19]:

#Open the spreadsheet template
wb = load_workbook(filename=outFilename)
ws = wb['template']
ws.title = str(year)

# ##### Notes on mapping values in the dataframe to PSUT cells
# * Some water usage categories are inserted into more than one PSUT cell - some into as many as three - so the value mapping file contains three sets of cell coordinates (i.e. column/row pairs). We therefore repeat the process of reading the dataframe and mapping values to the PSUT three times.
# 
# * In cases where two or more usage categories map to the same PSUT cell, the values are summed. This is done in the `dfAll.groupby` statement.
# 
# * Where no data exist for a particular usage category, the value `n/a` is inserted.

# In[20]:

#Loop through the first set of column/row pairs and insert values into the Excel spreadsheet
dfRound1 = dfAll.groupby(['Column1','Row1'])['MGal'].sum()
dfRound1.fillna(value="n/a", inplace=True)
for (column, row), value in dfRound1.items():
    #Set the value in the workbook (e.g. column 'B' + row '12' -> cell 'B12')
    cellAddress = str(column) + str(row)
    ws[cellAddress] = value
#Save the workbook
wb.save(outFilename)

# In[21]:

#Loop through the second set of column/row pairs and insert values into the Excel spreadsheet
dfRound2 = dfAll.groupby(['Column2','Row2'])['MGal'].sum()
dfRound2.fillna(value="n/a", inplace=True)
for (column, row), value in dfRound2.items():
    #Set the value in the workbook
    cellAddress = str(column) + str(row)
    ws[cellAddress] = value
#Save the workbook
wb.save(outFilename)

# In[22]:

#Loop through the third set of column/row pairs and insert values into the Excel spreadsheet
dfRound3 = dfAll.groupby(['Column3','Row3'])['MGal'].sum()
dfRound3.fillna(value="n/a", inplace=True)
for (column, row), value in dfRound3.items():
    #Set the value in the workbook
    cellAddress = str(column) + str(row)
    ws[cellAddress] = value
#Save the workbook
wb.save(outFilename)

# ##### Open the output workbook

# In[23]:

#Open the result (os.startfile is Windows-only; see the cross-platform sketch below)
os.startfile(outFilename)
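# ##### `os.startfile` is available on Windows only. The following optional cell is a sketch, not part of the original notebook, of a cross-platform alternative; it assumes a desktop session with a default application registered for `.xlsx` files.

# In[ ]:

#OPTIONAL: open the workbook on macOS or Linux as well as Windows
import platform, subprocess
system = platform.system()
if system == 'Windows':
    os.startfile(outFilename)                    #Windows default-application launch
elif system == 'Darwin':
    subprocess.call(['open', outFilename])       #macOS
else:
    subprocess.call(['xdg-open', outFilename])   #Linux desktop environments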