"""Correlate California county-level asthma outcomes with air pollution.

Loads CDC tracking-network asthma CSVs and EPA air-quality CSVs
(2005-2009), computes county-level yearly means, prints Pearson
correlations per pollutant/indicator/year, draws scatter plots, and
renders county choropleth maps with Basemap.

Basemap section adapted from:
http://www.geophysique.be/2013/02/12/matplotlib-basemap-tutorial-10-shapefiles-unleached-continued/
"""
import os
import re
import time
import inspect
import pickle
from itertools import islice

import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.collections import LineCollection
from mpl_toolkits.basemap import Basemap
import shapefile

# --------------------------------------------------------------------------
# Input file locations
# --------------------------------------------------------------------------
health_data_path = 'data/Asthma/'
enviro_data_path = 'data/Emissions/'

er_csvs = ('Emergency_Visits',
           [health_data_path + 'Tracking_Emergency_%d_CountyMeasures.csv' % y
            for y in range(2005, 2010)])
hosp_csvs = ('Hospitalisations',
             [health_data_path + 'Tracking_Hospital_%d_CountyMeasures.csv' % y
              for y in range(2005, 2010)])
indicators = [hosp_csvs, er_csvs]


def load_cali_health(filepath):
    """Load one asthma CSV into a one-column, FIPS-indexed DataFrame.

    The returned frame is indexed by county FIPS ('cnty_fips'); its single
    column is named by the 4-digit year parsed from the filename and holds
    the age-adjusted rate per 10000 as floats (NaN where the file contains
    the literal string 'null').  Raises Exception when no year is found.
    """
    m = re.match(r'.*(\d{4}).*', filepath)  # year is embedded in the filename
    if not m:
        raise Exception('no year found')
    year = int(m.group(1))
    with open(filepath, 'r') as csv_file:
        df = pd.read_csv(csv_file)
    df = df.set_index('cnty_fips')
    # Some entries are the string 'null'; replace them with NaN so the whole
    # column can then be converted to float.
    df[year] = df['Age Adj. Rate per 10000'].apply(
        lambda x: x if x != 'null' else float('NaN'))
    df[year] = df[year].apply(float)
    return df[[year]]


def create_df_from_csv(filepaths_list):
    """Concatenate per-year frames from load_cali_health into one frame."""
    return pd.concat([load_cali_health(fp) for fp in filepaths_list], axis=1)


indicator_dfs = {}
for name, paths in indicators:
    indicator_dfs[name] = create_df_from_csv(paths)

indicator_dfs['Hospitalisations'][2008].hist(bins=12)
indicator_dfs['Emergency_Visits'][2008].hist(bins=12)

# Ozone 2005-2009 followed by PM2.5 2005-2009.
air_quality_csvs = [enviro_data_path + '%s_%d.csv' % (poll, y)
                    for poll in ('Ozone', 'PM25')
                    for y in range(2005, 2010)]


def generate_averaged_df(filepath):
    """Average one EPA air-quality CSV down to county level.

    Returns (county_means, parameter): a DataFrame indexed by FIPS
    (state code * 1000 + county code) whose single column is named for the
    file's year and holds the yearly mean of 'Sample Measurement'
    (averaged first per monitoring site, then across sites per county),
    plus the pollutant's descriptor.  Assumes the whole file covers a
    single state and a single year.
    """
    with open(filepath) as a:
        df = pd.read_csv(a)
    measures_df = df[['County Code', 'Site Num', 'Sample Measurement']]
    parameter = df['AQS Parameter Desc'][0]  # the pollutant's descriptor/name
    year = int(df['Year GMT'][0])            # why a query can't span years
    state = int(df['State Code'][0])
    del df  # the ozone CSVs are ~700 MB; free memory as soon as possible
    # First mean per monitoring site, then mean across sites per county.
    site_means = measures_df.groupby(['County Code', 'Site Num'],
                                     as_index=False).mean()
    county_means = site_means.groupby(['County Code'], as_index=False).mean()
    # Build a proper FIPS by combining state and county codes.
    county_means['FIPS'] = county_means['County Code'].apply(
        lambda x: state * 1000 + int(x))
    del county_means['County Code']
    county_means = county_means.set_index(['FIPS'])
    county_means.columns = [year]
    return (county_means, parameter)


pollutant_tuples = [generate_averaged_df(f) for f in air_quality_csvs]

# Merge the per-file frames into one frame per pollutant (columns = years).
pollutant_dfs = {}
for county_means, pollutant in pollutant_tuples:
    if pollutant not in pollutant_dfs:
        pollutant_dfs[pollutant] = county_means
    else:
        pollutant_dfs[pollutant] = pd.concat(
            [pollutant_dfs[pollutant], county_means], axis=1)

pollutant_dfs['Ozone'][2008].hist(bins=12)
pollutant_dfs['PM2.5 - Local Conditions'][2008].hist(bins=12)


def calculate_all_r(pollutant_dict, health_dict):
    """Print Pearson r for every pollutant/indicator/year combination."""
    for pollutant, poll_df in pollutant_dict.items():
        poll_df = poll_df.dropna()
        for indicator, indi_df in health_dict.items():
            indi_df = indi_df.dropna()
            for year in pollutant_dict[pollutant].columns:
                print(pollutant, indicator, year,
                      poll_df[year].corr(indi_df[year]))


calculate_all_r(pollutant_dfs, indicator_dfs)


def plot_all_scatterplots(pollutant_dict, health_dict):
    """Scatter-plot every pollutant against every health indicator.

    One figure per pollutant/indicator pair, one subplot per year on a
    2-column grid; each subplot is annotated with the Pearson r of the
    inner-joined county data.
    """
    curr_fig = 0  # every pollutant/indicator pair gets its own figure
    for pollutant, poll_df in pollutant_dict.items():
        poll_df = poll_df.dropna()
        for indicator, indi_df in health_dict.items():
            indi_df = indi_df.dropna()
            num_years = len(pollutant_dict[pollutant].columns)
            num_columns = 2
            # 2-wide grid; '//' keeps this an int under Python 3.
            num_rows = (num_years // 2) + (num_years % 2)
            fig = plt.figure(curr_fig, figsize=(12, 6 * num_rows))
            fig_title = '%s and Asthma %s in California' % (pollutant, indicator)
            fig.suptitle(fig_title, fontsize=14)
            # Safe to increment here: not used again until the next indicator.
            curr_fig += 1
            for j, year in enumerate(sorted(pollutant_dict[pollutant].columns)):
                # Inner join of the two year columns: the datasets don't
                # carry the same set of counties.
                df = pd.concat([poll_df[[year]], indi_df[[year]]], axis=1,
                               join='inner', keys=[pollutant, indicator])
                # Subplot enumeration starts at 1, not 0.
                sp = plt.subplot(num_rows, num_columns, j + 1)
                plt.title(str(year))
                sp.scatter(df[df.columns[0]], df[df.columns[1]])
                plt.xlabel(pollutant)
                plt.ylabel(indicator)
                # Annotate with the correlation coefficient.
                r = df[df.columns[0]].corr(df[df.columns[1]])
                sp.text(0.95, 0.95, 'r=%s' % str(r),
                        horizontalalignment='right', verticalalignment='top',
                        transform=sp.transAxes, fontsize=14)


plot_all_scatterplots(pollutant_dfs, indicator_dfs)


# --------------------------------------------------------------------------
# County choropleth maps (Basemap + shapefile)
# --------------------------------------------------------------------------
def zip_filter_by_state(records, shapes, included_states=None):
    """Yield (record, shape) pairs whose state FIPS prefix is included.

    included_states is a list of state FIPS prefixes (e.g. ['06'] for
    California); None means no filtering.
    """
    for record, shape in zip(records, shapes):
        if included_states is None or record[0] in included_states:
            yield (record, shape)


def draw_mapy(df, data_title):
    """Draw one min-max-normalized choropleth per year column of df."""
    num_years = len(df.columns)
    plot_num = 1
    mpl.rcParams['font.size'] = 10.
    mpl.rcParams['font.family'] = 'Comic Sans MS'
    mpl.rcParams['axes.labelsize'] = 8.
    mpl.rcParams['xtick.labelsize'] = 6.
    mpl.rcParams['ytick.labelsize'] = 6.
    fig = plt.figure(figsize=(13.7, 13))  # 11.7,11
    # Custom adjustment of the subplots.
    plt.subplots_adjust(left=0.05, right=0.95, top=0.90, bottom=0.15,
                        wspace=0.15, hspace=0.05)
    for year in df.columns:
        # Normalize into [0, 1] on a copy so the caller's DataFrame is
        # not mutated.
        year_series = df[year].copy()
        minval = year_series.min()
        maxval = year_series.max()
        year_series = (year_series - minval) / (maxval - minval)
        ax1 = fig.add_subplot(1, num_years, plot_num)
        ax1.set_title(data_title + " " + str(year))
        draw_map(year_series, ax1)
        plot_num += 1


def draw_map(df_series, ax):
    """Render one California county choropleth of df_series onto ax.

    df_series maps FIPS (state*1000 + county) to values in [0, 1];
    counties missing from the series are drawn with value 0.
    """
    # Bounding box of California.
    x1 = -128.
    x2 = -114.
    y1 = 32.
    y2 = 43.
    m = Basemap(resolution='i', projection='merc',
                llcrnrlat=y1, urcrnrlat=y2, llcrnrlon=x1, urcrnrlon=x2,
                lat_ts=(y1 + y2) / 2)
    m.drawparallels(np.arange(y1, y2, 2.), labels=[1, 0, 0, 0], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)
    m.drawmeridians(np.arange(x1, x2, 2.), labels=[0, 0, 0, 1], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)
    # Prefer the UScounties shapefile shipped alongside Basemap; fall back
    # to a hard-coded local clone path otherwise.
    basemap_data_dir = os.path.join(
        os.path.dirname(inspect.getfile(Basemap)), "data")
    if os.path.exists(os.path.join(basemap_data_dir, "UScounties.shp")):
        shpf = shapefile.Reader(os.path.join(basemap_data_dir, "UScounties"))
    else:
        shpf = shapefile.Reader(
            "/Users/raymondyee/C/src/basemap/lib/mpl_toolkits/basemap/data/UScounties")
    shapes = shpf.shapes()
    records = shpf.records()
    # '06' is California; that is all we need.
    for record, shape in zip_filter_by_state(records, shapes, ['06']):
        lons, lats = zip(*shape.points)  # lat/long for each county
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data, ]
        else:
            # Split multi-part shapes at each part boundary...
            segs = []
            for i in range(1, len(shape.parts)):
                index = shape.parts[i - 1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            # ...and keep the tail after the last boundary exactly once.
            segs.append(data[index2:])
        lines = LineCollection(segs, antialiaseds=(1,))
        color = lookupfips(record[2], df_series)
        lines.set_label('C')
        lines.set_facecolors(cm.Blues(color))
        lines.set_edgecolors('k')
        lines.set_linewidth(0.1)
        ax.add_collection(lines)
    ax.plot()


def normalize(df_series):
    """Min-max normalize a Series into [0, 1] in place and return it."""
    minval = df_series.min()
    maxval = df_series.max()
    for i in df_series.index:
        df_series[i] = float((df_series[i] - minval) / (maxval - minval))
    return df_series


def lookupfips(x, df_series):
    """Map a shapefile FIPS string to its value in df_series (0 if absent).

    Shapefile records carry 5-digit FIPS strings ('06001'); the series
    index uses the numeric form without the leading zero, so strip it.
    """
    if len(x) == 5:
        x = x[1:]
    x = float(x)
    try:
        return df_series[x]
    except KeyError:
        return 0


def draw_maps(enviro_dict, health_dict):
    """Draw the four choropleth figures: ozone, PM2.5, and both asthma series."""
    pm_key = "PM2.5 - Local Conditions"
    draw_mapy(enviro_dict['Ozone'], "Ozone Levels ")
    draw_mapy(health_dict['Hospitalisations'], "Hospital - Asthma ")
    draw_mapy(enviro_dict[pm_key], "PM2.5 ")
    draw_mapy(health_dict['Emergency_Visits'], "Emergency - Asthma ")


draw_maps(pollutant_dfs, indicator_dfs)