#!/usr/bin/env python3
# coding: utf-8
"""Compile State Usage Data.

Loops through a list of all continental US states and pulls USGS water-use
data for each state into a single table, written to a CSV file.

Required packages:
 * pandas
 * us

Created September 2017
John.Fay@duke.edu
(Updated: ported to Python 3; removed the broken ``pip.main`` auto-install
hack — ``pip.main`` was removed in pip >= 10.)
"""
import urllib.parse

import pandas as pd
import us  # third-party: `pip install us` (provides state metadata)

# Year of USGS water-use data to retrieve
year = 2010

# Output CSV file, parameterized by year
outFilename = "../Data/AllStatesUsage{}.csv".format(year)


def getData(state_abbr, year):
    '''Downloads USGS data and creates a tidy dataframe of all the usage data for the state

    Parameters
    ----------
    state_abbr : str
        Two-letter state abbreviation used in the NWIS URL.
    year : int
        Water-use year to request.

    Returns
    -------
    pandas.DataFrame
        One column (named after ``state_abbr``) of state-wide totals in MGal,
        indexed by usage-category ('Group') name.
    '''
    # Set the data URL path and parameters and construct the url
    path = 'https://waterdata.usgs.gov/{}/nwis/water_use?'.format(state_abbr)
    values = {'format': 'rdb',
              'rdb_compression': 'value',
              'wu_area': 'County',
              'wu_year': year,
              'wu_county': 'ALL',
              'wu_county_nms': '--ALL+Counties--',
              'wu_category_nms': '--ALL+Categories--'
              }
    url = path + urllib.parse.urlencode(values)

    # Pull tab-delimited data in using the URL; the RDB file has two header
    # rows, so drop the 2nd level after reading.
    dfRaw = pd.read_csv(url, sep='\t', comment='#', header=[0, 1], na_values='-')
    dfRaw.columns = dfRaw.columns.droplevel(level=1)

    # Tidy the data: transform so data in each usage column become row values
    # with a new column listing the usage type.
    rowHeadings = ['county_cd', 'county_nm', 'state_cd', 'state_name', 'year']
    dfTidy = pd.melt(dfRaw, id_vars=rowHeadings, value_name='MGal', var_name='Group')

    # Remove rows that don't have volume data (i.e. keep only rows whose
    # source column name contains 'Mgal').
    dfTidy = dfTidy[dfTidy['Group'].str.contains('Mgal')].copy(deep=True)

    # Change the type of the MGal column to float
    dfTidy['MGal'] = dfTidy.MGal.astype('float')

    # Summarize county data for the whole state and convert to a dataframe
    stateSummary = dfTidy.groupby(['Group'])['MGal'].sum()
    dfState = pd.DataFrame(stateSummary)

    # Rename the MGal column to the state abbreviation so per-state frames
    # can be merged side by side.
    dfState.columns = [state_abbr]
    return dfState


if __name__ == "__main__":
    # Import data for the first state to seed the merged data frame.
    states = us.states.STATES_CONTINENTAL
    abbr = states[0].abbr
    print("Processing {},".format(abbr), end=' ', flush=True)
    dfAll = getData(abbr, year)

    # Loop through the remaining states and merge each onto dfAll.
    for state in states[1:]:
        abbr = state.abbr
        print(abbr, end=' ', flush=True)
        dfState = getData(abbr, year)
        # NOTE: left_index/right_index are boolean flags; the original passed
        # the string 'Group', which only worked because non-empty strings are
        # truthy. Both frames are indexed by 'Group', so merge on the index.
        dfAll = pd.merge(dfAll, dfState, how='inner',
                         left_index=True, right_index=True)

    # Write the output to a csv file
    dfAll.to_csv(outFilename)