#Set the year
year = 2010
#Create the output csv file
outFilename = "../Data/AllStatesUsage{}.csv".format(year)
#Import built-in modules
import os, urllib
#Import pandas, install if needed
try:
import pandas as pd
except:
import pip
pip.main(['install','pandas'])
import pandas as pd
#Import us, install if needed
try:
import us
except:
import pip
pip.main(['install','us'])
import us
def getData(state_abbr,year):
'''Downloads USGS data and creates a tidy dataframe of all the usage data for the state'''
#Set the data URL path and parameters and construct the url
path = 'https://waterdata.usgs.gov/{}/nwis/water_use?'.format(state_abbr)
values = {'format':'rdb',
'rdb_compression':'value',
'wu_area':'County',
'wu_year': year,
'wu_county':'ALL',
'wu_county_nms':'--ALL+Counties--',
'wu_category_nms':'--ALL+Categories--'
}
url = path + urllib.urlencode(values)
#Pull data in using the URL and remove the 2nd row of headers
dfRaw = pd.read_table(url,comment='#',header=[0,1],na_values='-')
dfRaw.columns = dfRaw.columns.droplevel(level=1)
#Tidy the data: transform so data in each usage column become row values with a new column listing the usage type
rowHeadings = ['county_cd', 'county_nm', 'state_cd', 'state_name', 'year']
dfTidy = pd.melt(dfRaw,id_vars=rowHeadings,value_name='MGal',var_name='Group')
#Remove rows that don't have volume data (i.e. keep only columns with 'Mgal' in the name)
dfTidy = dfTidy[dfTidy['Group'].str.contains('Mgal')].copy(deep=True)
#Change the type of the MGal column to float
dfTidy['MGal'] = dfTidy.MGal.astype('float')
#Summarize county data for the whole state
stateSummary = dfTidy.groupby(['Group'])['MGal'].sum()
#Convert to a dataframe
dfState = pd.DataFrame(stateSummary)
#Rename the MGal column to the state abbreviation
dfState.columns = [state_abbr]
#Return the dataframe
return dfState
#Get the data for the first state
abbr = us.states.STATES_CONTINENTAL[0].abbr
print "Processing {}, ".format(abbr),
dfAll = getData(abbr,year)
Processing AL,
#Loop through the remaining states
for state in us.states.STATES_CONTINENTAL[1:]:
abbr = state.abbr
print abbr,
dfState = getData(abbr,year)
dfAll = pd.merge(dfAll,dfState,how='inner',left_index='Group',right_index='Group')
AK AZ AR CA CO CT DE DC FL GA ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY
dfAll.to_csv(outFilename)