"""Correlate California county-level asthma outcomes with air pollution.

Loads CDC tracking-network asthma CSVs and EPA air-quality CSVs
(2005-2009), computes county-level yearly means, prints Pearson
correlations per pollutant/indicator/year, draws scatter plots, and
renders county choropleth maps with Basemap.

Basemap section adapted from:
http://www.geophysique.be/2013/02/12/matplotlib-basemap-tutorial-10-shapefiles-unleached-continued/
"""
import os
import re
import time
import inspect
import pickle
from itertools import islice

import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.collections import LineCollection
from mpl_toolkits.basemap import Basemap
import shapefile

# --------------------------------------------------------------------------
# Input file locations
# --------------------------------------------------------------------------
health_data_path = 'data/Asthma/'
enviro_data_path = 'data/Emissions/'

er_csvs = ('Emergency_Visits',
           [health_data_path + 'Tracking_Emergency_%d_CountyMeasures.csv' % y
            for y in range(2005, 2010)])
hosp_csvs = ('Hospitalisations',
             [health_data_path + 'Tracking_Hospital_%d_CountyMeasures.csv' % y
              for y in range(2005, 2010)])
indicators = [hosp_csvs, er_csvs]


def load_cali_health(filepath):
    """Load one asthma CSV into a one-column, FIPS-indexed DataFrame.

    The returned frame is indexed by county FIPS ('cnty_fips'); its single
    column is named by the 4-digit year parsed from the filename and holds
    the age-adjusted rate per 10000 as floats (NaN where the file contains
    the literal string 'null').  Raises Exception when no year is found.
    """
    m = re.match(r'.*(\d{4}).*', filepath)  # year is embedded in the filename
    if not m:
        raise Exception('no year found')
    year = int(m.group(1))
    with open(filepath, 'r') as csv_file:
        df = pd.read_csv(csv_file)
    df = df.set_index('cnty_fips')
    # Some entries are the string 'null'; replace them with NaN so the whole
    # column can then be converted to float.
    df[year] = df['Age Adj. Rate per 10000'].apply(
        lambda x: x if x != 'null' else float('NaN'))
    df[year] = df[year].apply(float)
    return df[[year]]


def create_df_from_csv(filepaths_list):
    """Concatenate per-year frames from load_cali_health into one frame."""
    return pd.concat([load_cali_health(fp) for fp in filepaths_list], axis=1)


indicator_dfs = {}
for name, paths in indicators:
    indicator_dfs[name] = create_df_from_csv(paths)

indicator_dfs['Hospitalisations'][2008].hist(bins=12)
indicator_dfs['Emergency_Visits'][2008].hist(bins=12)

# Ozone 2005-2009 followed by PM2.5 2005-2009.
air_quality_csvs = [enviro_data_path + '%s_%d.csv' % (poll, y)
                    for poll in ('Ozone', 'PM25')
                    for y in range(2005, 2010)]


def generate_averaged_df(filepath):
    """Average one EPA air-quality CSV down to county level.

    Returns (county_means, parameter): a DataFrame indexed by FIPS
    (state code * 1000 + county code) whose single column is named for the
    file's year and holds the yearly mean of 'Sample Measurement'
    (averaged first per monitoring site, then across sites per county),
    plus the pollutant's descriptor.  Assumes the whole file covers a
    single state and a single year.
    """
    with open(filepath) as a:
        df = pd.read_csv(a)
    measures_df = df[['County Code', 'Site Num', 'Sample Measurement']]
    parameter = df['AQS Parameter Desc'][0]  # the pollutant's descriptor/name
    year = int(df['Year GMT'][0])            # why a query can't span years
    state = int(df['State Code'][0])
    del df  # the ozone CSVs are ~700 MB; free memory as soon as possible
    # First mean per monitoring site, then mean across sites per county.
    site_means = measures_df.groupby(['County Code', 'Site Num'],
                                     as_index=False).mean()
    county_means = site_means.groupby(['County Code'], as_index=False).mean()
    # Build a proper FIPS by combining state and county codes.
    county_means['FIPS'] = county_means['County Code'].apply(
        lambda x: state * 1000 + int(x))
    del county_means['County Code']
    county_means = county_means.set_index(['FIPS'])
    county_means.columns = [year]
    return (county_means, parameter)


pollutant_tuples = [generate_averaged_df(f) for f in air_quality_csvs]

# Merge the per-file frames into one frame per pollutant (columns = years).
pollutant_dfs = {}
for county_means, pollutant in pollutant_tuples:
    if pollutant not in pollutant_dfs:
        pollutant_dfs[pollutant] = county_means
    else:
        pollutant_dfs[pollutant] = pd.concat(
            [pollutant_dfs[pollutant], county_means], axis=1)

pollutant_dfs['Ozone'][2008].hist(bins=12)
pollutant_dfs['PM2.5 - Local Conditions'][2008].hist(bins=12)


def calculate_all_r(pollutant_dict, health_dict):
    """Print Pearson r for every pollutant/indicator/year combination."""
    for pollutant, poll_df in pollutant_dict.items():
        poll_df = poll_df.dropna()
        for indicator, indi_df in health_dict.items():
            indi_df = indi_df.dropna()
            for year in pollutant_dict[pollutant].columns:
                print(pollutant, indicator, year,
                      poll_df[year].corr(indi_df[year]))


calculate_all_r(pollutant_dfs, indicator_dfs)


def plot_all_scatterplots(pollutant_dict, health_dict):
    """Scatter-plot every pollutant against every health indicator.

    One figure per pollutant/indicator pair, one subplot per year on a
    2-column grid; each subplot is annotated with the Pearson r of the
    inner-joined county data.
    """
    curr_fig = 0  # every pollutant/indicator pair gets its own figure
    for pollutant, poll_df in pollutant_dict.items():
        poll_df = poll_df.dropna()
        for indicator, indi_df in health_dict.items():
            indi_df = indi_df.dropna()
            num_years = len(pollutant_dict[pollutant].columns)
            num_columns = 2
            # 2-wide grid; '//' keeps this an int under Python 3.
            num_rows = (num_years // 2) + (num_years % 2)
            fig = plt.figure(curr_fig, figsize=(12, 6 * num_rows))
            fig_title = '%s and Asthma %s in California' % (pollutant, indicator)
            fig.suptitle(fig_title, fontsize=14)
            # Safe to increment here: not used again until the next indicator.
            curr_fig += 1
            for j, year in enumerate(sorted(pollutant_dict[pollutant].columns)):
                # Inner join of the two year columns: the datasets don't
                # carry the same set of counties.
                df = pd.concat([poll_df[[year]], indi_df[[year]]], axis=1,
                               join='inner', keys=[pollutant, indicator])
                # Subplot enumeration starts at 1, not 0.
                sp = plt.subplot(num_rows, num_columns, j + 1)
                plt.title(str(year))
                sp.scatter(df[df.columns[0]], df[df.columns[1]])
                plt.xlabel(pollutant)
                plt.ylabel(indicator)
                # Annotate with the correlation coefficient.
                r = df[df.columns[0]].corr(df[df.columns[1]])
                sp.text(0.95, 0.95, 'r=%s' % str(r),
                        horizontalalignment='right', verticalalignment='top',
                        transform=sp.transAxes, fontsize=14)


plot_all_scatterplots(pollutant_dfs, indicator_dfs)


# --------------------------------------------------------------------------
# County choropleth maps (Basemap + shapefile)
# --------------------------------------------------------------------------
def zip_filter_by_state(records, shapes, included_states=None):
    """Yield (record, shape) pairs whose state FIPS prefix is included.

    included_states is a list of state FIPS prefixes (e.g. ['06'] for
    California); None means no filtering.
    """
    for record, shape in zip(records, shapes):
        if included_states is None or record[0] in included_states:
            yield (record, shape)


def draw_mapy(df, data_title):
    """Draw one min-max-normalized choropleth per year column of df."""
    num_years = len(df.columns)
    plot_num = 1
    mpl.rcParams['font.size'] = 10.
    mpl.rcParams['font.family'] = 'Comic Sans MS'
    mpl.rcParams['axes.labelsize'] = 8.
    mpl.rcParams['xtick.labelsize'] = 6.
    mpl.rcParams['ytick.labelsize'] = 6.
    fig = plt.figure(figsize=(13.7, 13))  # 11.7,11
    # Custom adjustment of the subplots.
    plt.subplots_adjust(left=0.05, right=0.95, top=0.90, bottom=0.15,
                        wspace=0.15, hspace=0.05)
    for year in df.columns:
        # Normalize into [0, 1] on a copy so the caller's DataFrame is
        # not mutated.
        year_series = df[year].copy()
        minval = year_series.min()
        maxval = year_series.max()
        year_series = (year_series - minval) / (maxval - minval)
        ax1 = fig.add_subplot(1, num_years, plot_num)
        ax1.set_title(data_title + " " + str(year))
        draw_map(year_series, ax1)
        plot_num += 1


def draw_map(df_series, ax):
    """Render one California county choropleth of df_series onto ax.

    df_series maps FIPS (state*1000 + county) to values in [0, 1];
    counties missing from the series are drawn with value 0.
    """
    # Bounding box of California.
    x1 = -128.
    x2 = -114.
    y1 = 32.
    y2 = 43.
    m = Basemap(resolution='i', projection='merc',
                llcrnrlat=y1, urcrnrlat=y2, llcrnrlon=x1, urcrnrlon=x2,
                lat_ts=(y1 + y2) / 2)
    m.drawparallels(np.arange(y1, y2, 2.), labels=[1, 0, 0, 0], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)
    m.drawmeridians(np.arange(x1, x2, 2.), labels=[0, 0, 0, 1], color='black',
                    dashes=[1, 0], labelstyle='+/-', linewidth=0.2)
    # Prefer the UScounties shapefile shipped alongside Basemap; fall back
    # to a hard-coded local clone path otherwise.
    basemap_data_dir = os.path.join(
        os.path.dirname(inspect.getfile(Basemap)), "data")
    if os.path.exists(os.path.join(basemap_data_dir, "UScounties.shp")):
        shpf = shapefile.Reader(os.path.join(basemap_data_dir, "UScounties"))
    else:
        shpf = shapefile.Reader(
            "/Users/raymondyee/C/src/basemap/lib/mpl_toolkits/basemap/data/UScounties")
    shapes = shpf.shapes()
    records = shpf.records()
    # '06' is California; that is all we need.
    for record, shape in zip_filter_by_state(records, shapes, ['06']):
        lons, lats = zip(*shape.points)  # lat/long for each county
        data = np.array(m(lons, lats)).T
        if len(shape.parts) == 1:
            segs = [data, ]
        else:
            # Split multi-part shapes at each part boundary...
            segs = []
            for i in range(1, len(shape.parts)):
                index = shape.parts[i - 1]
                index2 = shape.parts[i]
                segs.append(data[index:index2])
            # ...and keep the tail after the last boundary exactly once.
            segs.append(data[index2:])
        lines = LineCollection(segs, antialiaseds=(1,))
        color = lookupfips(record[2], df_series)
        lines.set_label('C')
        lines.set_facecolors(cm.Blues(color))
        lines.set_edgecolors('k')
        lines.set_linewidth(0.1)
        ax.add_collection(lines)
    ax.plot()


def normalize(df_series):
    """Min-max normalize a Series into [0, 1] in place and return it."""
    minval = df_series.min()
    maxval = df_series.max()
    for i in df_series.index:
        df_series[i] = float((df_series[i] - minval) / (maxval - minval))
    return df_series


def lookupfips(x, df_series):
    """Map a shapefile FIPS string to its value in df_series (0 if absent).

    Shapefile records carry 5-digit FIPS strings ('06001'); the series
    index uses the numeric form without the leading zero, so strip it.
    """
    if len(x) == 5:
        x = x[1:]
    x = float(x)
    try:
        return df_series[x]
    except KeyError:
        return 0


def draw_maps(enviro_dict, health_dict):
    """Draw the four choropleth figures: ozone, PM2.5, and both asthma series."""
    pm_key = "PM2.5 - Local Conditions"
    draw_mapy(enviro_dict['Ozone'], "Ozone Levels ")
    draw_mapy(health_dict['Hospitalisations'], "Hospital - Asthma ")
    draw_mapy(enviro_dict[pm_key], "PM2.5 ")
    draw_mapy(health_dict['Emergency_Visits'], "Emergency - Asthma ")


draw_maps(pollutant_dfs, indicator_dfs)