git clone https://github.com/djgagne/ucar_sea_2015.git
# Useful general libraries
%matplotlib inline
import numpy as np
from scipy.ndimage import gaussian_filter
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from mpl_toolkits.basemap import Basemap
from collections import OrderedDict
# Data Processing
import pandas as pd
from netCDF4 import Dataset
# Custom image segmentation libraries
from hail.EnhancedWatershedSegmenter import EnhancedWatershed
from hail.Hysteresis import Hysteresis
Radar-estimated hail size based on a 3D radar reflectivity mosaic with 1 km grid-spacing
# Load MESH data and plot it
with Dataset("data/MESH_4km_20140603-00:00_20140603-23:00") as mesh_file:
lon = mesh_file.variables['lon'][:]
lat = mesh_file.variables['lat'][:]
mesh_data = mesh_file.variables['MESH'][:]
mesh_hour = 22
bmap_hail = Basemap(projection='cyl', resolution='l',
llcrnrlon=-105, urcrnrlon=-92,
llcrnrlat=40, urcrnrlat=45)
plt.figure(figsize=(10,6))
bmap_hail.drawstates()
bmap_hail.drawlsmask()
cont = plt.contourf(lon, lat, mesh_data[mesh_hour],
np.arange(5,105,5), cmap="PuBuGn_r")
cbar = plt.colorbar(orientation="horizontal",shrink=0.9,fraction=0.1, pad=0.01)
title_obj = plt.title("MESH Swaths: June 3, 2014 {0:02d}00 UTC".format(mesh_hour), fontsize=20)
from skimage.measure import regionprops
#Hysteresis (low threshold, high threshold)
hyst = Hysteresis(5,10)
# Enhanced Watershed (low threshold,
# discretization interval,
# discretization maximum,
# size threshold,
# number of watershed iterations)
ew = EnhancedWatershed(5,1,100,100,50)
labels = OrderedDict()
labels['Hysteresis'] = hyst.label(gaussian_filter(mesh_data[22], 1))
print "Number of Hysteresis Objects:", labels['Hysteresis'].max()
labels['Enhanced Watershed'] = ew.label(gaussian_filter(mesh_data[22],1))
print "Number of Enhanced Watershed Objects:", labels['Enhanced Watershed'].max()
labels['Enhanced Watershed Size Filter'] = ew.size_filter(labels['Enhanced Watershed'], 50)
print "Number of Size Filtered Objects:", labels['Enhanced Watershed Size Filter'].max()
Number of Hysteresis Objects: 21 Number of Enhanced Watershed Objects: 30 Number of Size Filtered Objects: 7
# Plot objects
cm = ListedColormap(np.random.random((50,3)))
plt.figure(figsize=(14,6))
plt.subplot(2,2,1)
bmap_hail.drawstates()
plt.title("MESH", fontsize=20)
cont = plt.contourf(lon, lat, mesh_data[22],
np.arange(5,105,5), cmap="PuBuGn_r")
sp = 2
for plot_name, image in labels.iteritems():
plt.subplot(2,2,sp)
bmap_hail.drawstates()
ht = plt.title(plot_name, fontsize=20)
pcol = plt.contourf(lon, lat, np.ma.array(image, mask=image < 1),
np.arange(image.max() + 1), cmap=cm)
#Find centroids
props = regionprops(image)
centroids = np.round([prop.centroid for prop in props]).astype(int)
plt.scatter(lon[centroids[:,0],centroids[:,1]],
lat[centroids[:,0],centroids[:,1]], 10, 'r')
sp += 1
# Red regions are objects, foothills regions are in orange
pixels, quant_map = ew.quantize(gaussian_filter(mesh_data[22], 1))
marked = ew.find_local_maxima(gaussian_filter(mesh_data[22],1))
plt.figure(figsize=(30,15))
bmap_hail.drawstates()
plt.contourf(lon, lat, marked,[-3,-2,0, 100],colors=['orange','white','red'])
<matplotlib.contour.QuadContourSet instance at 0x12cbbaef0>
solar_training_data = pd.read_csv("data/grafs_train_data.csv")
solar_testing_data = pd.read_csv("data/grafs_test_data.csv")
print solar_training_data.columns
Index([u'station', u'date', u'valid_hour_utc', u'valid_hour_pst', u'forecast_hour', u'day_of_year', u'sine_doy', u'row', u'col', u'asp', u'dem', u'slope', u'T_f', u'T_f_mean', u'T_f_max', u'T_f_min', u'T_f_median', u'T_f_correlate', u'T_f_gradient', u'av_dswrf_sfc_f', u'av_dswrf_sfc_f_mean', u'av_dswrf_sfc_f_max', u'av_dswrf_sfc_f_min', u'av_dswrf_sfc_f_median', u'av_dswrf_sfc_f_correlate', u'av_dswrf_sfc_f_gradient', u'cloud_cover_f', u'cloud_cover_f_mean', u'cloud_cover_f_max', u'cloud_cover_f_min', u'cloud_cover_f_median', u'cloud_cover_f_correlate', u'cloud_cover_f_gradient', u'dewpt_f', u'dewpt_f_mean', u'dewpt_f_max', u'dewpt_f_min', u'dewpt_f_median', u'dewpt_f_correlate', u'dewpt_f_gradient', u'index', u'av_dswrf_sfc', u'lat', u'lon', u'add_diff'], dtype='object')
train_stations = solar_training_data['station'].unique()
max_solar = np.zeros(train_stations.size)
station_lats = np.zeros(train_stations.size)
station_lons = np.zeros(train_stations.size)
for ts, train_station in enumerate(train_stations):
is_station = solar_training_data['station'] == train_station
station_idxs = np.nonzero(is_station)[0]
max_solar[ts] = solar_training_data['av_dswrf_sfc'][is_station].max()
station_lons[ts] = solar_training_data['lon'][station_idxs[0]]
station_lats[ts] = solar_training_data['lat'][station_idxs[0]]
bmap = Basemap(projection='cyl', resolution='l',
llcrnrlon=-125, urcrnrlon=-93, llcrnrlat=25, urcrnrlat=43)
plt.figure(figsize=(15,7))
bmap.drawstates()
bmap.drawcoastlines()
bmap.drawcountries()
plt.title("Training Stations", fontsize=20)
plt.scatter(station_lons, station_lats, 20,max_solar,cmap=plt.get_cmap('Reds'))
<matplotlib.collections.PathCollection at 0x1093709d0>
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
# Linear regression
from sklearn.linear_model import Ridge
ridge = Pipeline([('anova',SelectKBest(f_regression,k=15)),('ridge',Ridge(alpha=1))])
# Random forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, max_features='sqrt', max_depth=6)
# Gradient Boosting Regression Trees
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, loss='lad', max_depth=6)
#K-Nearest Neighbors Regression
from sklearn.neighbors import KNeighborsRegressor
knr = Pipeline([('anova',SelectKBest(f_regression,k=15)),
('knn',KNeighborsRegressor(n_neighbors=10, weights='uniform'))])
input_columns = np.array(['valid_hour_pst', 'forecast_hour', 'asp', 'dem', 'slope','lat','lon',
'T_f', 'T_f_mean', 'T_f_max', 'T_f_min', 'T_f_median', 'T_f_correlate', 'T_f_gradient',
'av_dswrf_sfc_f', 'av_dswrf_sfc_f_mean', 'av_dswrf_sfc_f_max', 'av_dswrf_sfc_f_min',
'av_dswrf_sfc_f_median', 'av_dswrf_sfc_f_correlate', 'av_dswrf_sfc_f_gradient',
'cloud_cover_f', 'cloud_cover_f_mean', 'cloud_cover_f_max', 'cloud_cover_f_min',
'cloud_cover_f_median', 'cloud_cover_f_correlate', 'cloud_cover_f_gradient'])
output_column = 'av_dswrf_sfc'
print "Fitting ridge"
ridge.fit(solar_training_data[input_columns], solar_training_data[output_column])
print "Fitting random forest"
rf.fit(solar_training_data[input_columns], solar_training_data[output_column])
print "Fitting gradient boosted regression"
gbr.fit(solar_training_data[input_columns], solar_training_data[output_column])
print "Fitting KNR"
knr.fit(solar_training_data[input_columns], solar_training_data[output_column])
Fitting ridge Fitting random forest Fitting gradient boosted regression Fitting KNR
Pipeline(steps=[('anova', SelectKBest(k=15, score_func=<function f_regression at 0x10a39aa28>)), ('knn', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_neighbors=10, p=2, weights='uniform'))])
Feature Importance quantifies the average increase in error caused by randomizing the values of an input feature.
rf_var_importance_rankings = np.argsort(rf.feature_importances_)[::-1]
gbr_var_importance_rankings = np.argsort(gbr.feature_importances_)[::-1]
knr_var_names = knr.steps[0][1].get_support(indices=True)
print "Random Forest".ljust(34) + " Gradient Boosting".ljust(35)
for r, g in zip(rf_var_importance_rankings[:15],gbr_var_importance_rankings[:15]):
print "{0}: {1:0.3f} | {2}: {3:0.3f}".format(input_columns[r].ljust(25),
rf.feature_importances_[r],
input_columns[g].ljust(25),
gbr.feature_importances_[g])
Random Forest Gradient Boosting av_dswrf_sfc_f_correlate : 0.187 | av_dswrf_sfc_f_min : 0.535 av_dswrf_sfc_f_median : 0.146 | av_dswrf_sfc_f_max : 0.157 av_dswrf_sfc_f_mean : 0.136 | lon : 0.043 av_dswrf_sfc_f : 0.124 | lat : 0.035 av_dswrf_sfc_f_max : 0.094 | av_dswrf_sfc_f_correlate : 0.030 av_dswrf_sfc_f_min : 0.093 | valid_hour_pst : 0.030 forecast_hour : 0.066 | slope : 0.030 valid_hour_pst : 0.048 | dem : 0.029 cloud_cover_f_max : 0.019 | forecast_hour : 0.027 cloud_cover_f_median : 0.016 | av_dswrf_sfc_f_median : 0.016 cloud_cover_f_correlate : 0.013 | av_dswrf_sfc_f_mean : 0.015 cloud_cover_f_mean : 0.012 | asp : 0.012 cloud_cover_f : 0.008 | av_dswrf_sfc_f_gradient : 0.007 cloud_cover_f_min : 0.007 | T_f_gradient : 0.006 T_f_median : 0.005 | T_f_max : 0.004
def mean_absolute_error(forecast, obs):
return np.mean(np.abs(forecast - obs))
predictions = {}
predictions['Ridge Regression'] = ridge.predict(solar_testing_data[input_columns])
predictions['Random Forest'] = rf.predict(solar_testing_data[input_columns])
predictions['Gradient Boosting'] = gbr.predict(solar_testing_data[input_columns])
predictions['KNR'] = knr.predict(solar_testing_data[input_columns])
print 'NAM'.ljust(20) + "{0:0.3f}".format(mean_absolute_error(solar_testing_data['av_dswrf_sfc_f'],
solar_testing_data[output_column]))
for model_name, preds in predictions.iteritems():
print model_name.ljust(20) + "{0:0.3f}".format(mean_absolute_error(preds, solar_testing_data[output_column]))
NAM 73.475 KNR 77.720 Ridge Regression 77.994 Random Forest 69.318 Gradient Boosting 66.678
plt.figure(figsize=(15,4))
m = 1
for model_name, preds in predictions.iteritems():
plt.subplot(1,len(predictions.keys()),m)
plt.scatter(predictions[model_name], solar_testing_data[output_column],1,'k')
plt.plot(np.arange(0,900,100),np.arange(0,900,100),'r--')
plt.xlim(0,800)
plt.ylim(0,800)
plt.xlabel(model_name, fontsize=14)
if m == 1:
plt.ylabel("Obs", fontsize=14)
m += 1
test_stations = solar_testing_data['station'].unique()
station_errors = np.zeros((test_stations.size))
station_lats = np.zeros(test_stations.size)
station_lons = np.zeros(test_stations.size)
#Calculate the mean absolute error at each station
for ts, test_station in enumerate(test_stations):
is_station = solar_testing_data['station'] == test_station
station_idxs = np.nonzero(is_station)[0]
station_errors[ts] = mean_absolute_error(predictions['Gradient Boosting'][np.nonzero(is_station)],
solar_testing_data.loc[is_station,output_column])
station_lons[ts] = solar_testing_data['lon'][station_idxs[0]]
station_lats[ts] = solar_testing_data['lat'][station_idxs[0]]
#Plot the station errors
bmap = Basemap(projection='cyl', llcrnrlon=-125, urcrnrlon=-93, llcrnrlat=25, urcrnrlat=43)
plt.figure(figsize=(15,7))
bmap.drawstates()
bmap.drawcoastlines()
bmap.drawcountries()
plt.title("Gradient Boosting Regression Mean Absolute Error by Station", fontsize=18)
plt.scatter(station_lons, station_lats, np.sqrt(station_errors) * 4,station_errors,
vmin=station_errors.min(),
vmax=np.percentile(station_errors,95),
cmap='RdBu_r')
plt.colorbar()
<matplotlib.colorbar.Colorbar instance at 0x11d27ce18>
def plot_station_errors(solar_testing_data, predictions, model_name, station_number):
station_idxs = np.where(solar_testing_data['station']==station_number)[0]
plt.plot(predictions[station_idxs],'ro-',label=model)
plt.plot(solar_testing_data[output_column].values[station_idxs],'bo-',label='Obs')
plt.ylim(0,700)
plt.xticks(np.arange(len(station_idxs)), solar_testing_data['valid_hour_pst'].values[station_idxs])
plt.title(model_name, fontsize=16)
#Stations are ordered by their gradient boosted regression error
station_number = test_stations[np.argsort(station_errors)][-2]
plt.figure(figsize=(15,6))
for m, model in enumerate(predictions.iterkeys()):
plt.subplot(2, len(predictions.keys()) / 2, m + 1)
if m % 2 == 0:
plt.ylabel("Global Horizontal Irradiance")
plot_station_errors(solar_testing_data, predictions[model], model, station_number)
if m > 1:
plt.xlabel("Valid Hour (PST)")
Forecast Observation