#!/usr/bin/env python
# coding: utf-8
#
# # Utilizing Scientific Python Tools for the Application of Data Science Techniques to High Impact Weather Prediction
# David John Gagne II
# University of Oklahoma/National Center for Atmospheric Research
# April 13, 2015
# 2015 UCAR SEA Conference
# Twitter: @DJGagneDos
# Github: https://www.github.com/djgagne
# ##Acknowledgements
# * OU Advisor: Amy McGovern
# * NCAR Advisors: Sue Haupt and John Williams
# * Severe Hail Analysis, Representation and Prediction (SHARP) Team: Nate Snook, Youngsun Jung, Jon Labriola
# * Gridded Atmospheric Forecast System (GRAFS) Team: Seth Linden, Gerry Wiener, Bill Petzke, Jared Lee
# * CAPS: Ming Xue, Kevin Thomas, Keith Brewster, Yunheng Wang, Chris Cook
# * NOAA: Jimmy Correia, Adam Clark, Michael Coniglio
# ##WARNING: Do ~~NOT~~ Try This At Home!
# * Goal: Demonstrate the power of Python Data Science tools
# * Provide a sandbox for users to experiment with real weather data
# * Presentation, source code, and data available at https://www.github.com/djgagne/ucar_sea_2015/
# * Requires standard Anaconda Python Distribution with netCDF4-python, Basemap, and IPython 3.0
# * If you have git installed: `git clone https://github.com/djgagne/ucar_sea_2015.git`
# * Otherwise download the zip file: https://github.com/djgagne/ucar_sea_2015/archive/master.zip
# ## Motivation
# * High impact weather includes weather phenomena that cause large personal and/or economic losses
# * Includes extreme weather disasters
# * Hurricanes, tornadoes, __hail__, floods
# * Also includes "normal" weather events with economic effects
# * __Unforeseen partly cloudy day causes unexpected fluctuations in solar power generation__
# * Mild, dry weather closes ski resorts early
# ##Python and Data Science in High Impact Weather Prediction
# * Bridge the gap from raw numerical weather prediction (NWP) model output and observations to actionable information
# * Compiling historical model runs and observations
# * Pre-processing data
# * Applying statistical learning models to correct output and make derived products
# * Visualizing results
# #Interactive Demonstrations
# * Hail Size Prediction
# * Locating hail swaths with image processing techniques
# * Solar Irradiance Prediction
# * Train statistical learning models
# * Validate model predictions
# # Hail Size Prediction
# * Current hail forecasts tend to be coarse in space and time
# * Storm-scale models can produce high resolution simulations of hail-producing storms
# * Storm-scale models do not produce best estimates of hail size or uncertainty
# * Statistical learning models can provide calibrated size and uncertainty estimates
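# One common way to produce such calibrated estimates is quantile regression; the minimal, self-contained sketch below uses synthetic stand-in data and is illustrative only, not necessarily the SHARP team's method.

# In[ ]:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

# Synthetic stand-ins for storm features and observed hail sizes
rng = np.random.RandomState(0)
storm_features = rng.random_sample((500, 5))
hail_sizes = 50.0 * storm_features[:, 0] + 10.0 * rng.randn(500)

# Quantile loss yields a percentile of the hail size distribution
# rather than a single point estimate
hail_q90 = GradientBoostingRegressor(loss='quantile', alpha=0.9, n_estimators=100)
hail_q90.fit(storm_features, hail_sizes)
size_90th_percentile = hail_q90.predict(storm_features)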
# ##Python Libraries
#
# In[1]:
# Useful general libraries
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
from scipy.ndimage import gaussian_filter
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from mpl_toolkits.basemap import Basemap
from collections import OrderedDict
# Data Processing
import pandas as pd
from netCDF4 import Dataset
# Custom image segmentation libraries
from hail.EnhancedWatershedSegmenter import EnhancedWatershed
from hail.Hysteresis import Hysteresis
# ## Data: NOAA NSSL Multi-Radar Multi-Sensor Maximum Estimated Size of Hail (MESH)
# Radar-estimated hail size based on a 3D radar reflectivity mosaic with 1-km grid spacing
# In[2]:
# Load MESH data and plot it
with Dataset("data/MESH_4km_20140603-00:00_20140603-23:00") as mesh_file:
    lon = mesh_file.variables['lon'][:]
    lat = mesh_file.variables['lat'][:]
    mesh_data = mesh_file.variables['MESH'][:]
mesh_hour = 22
bmap_hail = Basemap(projection='cyl', resolution='l',
                    llcrnrlon=-105, urcrnrlon=-92,
                    llcrnrlat=40, urcrnrlat=45)
plt.figure(figsize=(10, 6))
bmap_hail.drawstates()
bmap_hail.drawlsmask()
cont = plt.contourf(lon, lat, mesh_data[mesh_hour],
                    np.arange(5, 105, 5), cmap="PuBuGn_r")
cbar = plt.colorbar(orientation="horizontal", shrink=0.9, fraction=0.1, pad=0.01)
title_obj = plt.title("MESH Swaths: June 3, 2014 {0:02d}00 UTC".format(mesh_hour), fontsize=20)
# ##Image Segmentation
# * Split image into distinct regions or objects based on intensity values
# * Thresholding: objects are contiguous areas with intensities above a specified threshold (a baseline sketch follows this list)
# * Hysteresis: contiguous areas above a threshold containing at least 1 point above a second threshold
# * Watershed: objects are "grown" from local maxima in layers until a minimum intensity is met
# * Enhanced Watershed (Lakshmanan et al. 2009): Adds size criteria, buffer regions around objects, and other enhancements
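# As a baseline for comparison, plain thresholding takes only a few lines with scipy.ndimage; a minimal sketch (the `hail` package used below supplies the more capable hysteresis and enhanced watershed implementations):

# In[ ]:

from scipy.ndimage import label

def threshold_segment(field, threshold):
    """Label contiguous regions where field exceeds threshold.

    Returns an integer array in which 0 marks background and 1..n mark
    the n distinct objects, along with the object count n.
    """
    labeled, num_objects = label(field > threshold)
    return labeled, num_objects

# Example: segment the smoothed MESH grid at a threshold of 5
# (the same low threshold used for hysteresis below)
threshold_objects, n_objects = threshold_segment(gaussian_filter(mesh_data[22], 1), 5)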
# ## Image Segmentation Process
# In[47]:
from skimage.measure import regionprops
#Hysteresis (low threshold, high threshold)
hyst = Hysteresis(5, 10)
# Enhanced Watershed (low threshold,
#                     discretization interval,
#                     discretization maximum,
#                     size threshold,
#                     number of watershed iterations)
ew = EnhancedWatershed(5, 1, 100, 100, 50)
labels = OrderedDict()
labels['Hysteresis'] = hyst.label(gaussian_filter(mesh_data[22], 1))
print("Number of Hysteresis Objects:", labels['Hysteresis'].max())
labels['Enhanced Watershed'] = ew.label(gaussian_filter(mesh_data[22], 1))
print("Number of Enhanced Watershed Objects:", labels['Enhanced Watershed'].max())
labels['Enhanced Watershed Size Filter'] = ew.size_filter(labels['Enhanced Watershed'], 50)
print("Number of Size Filtered Objects:", labels['Enhanced Watershed Size Filter'].max())
# ## Image Segmentation Examples
# In[48]:
# Plot objects
cm = ListedColormap(np.random.random((50, 3)))
plt.figure(figsize=(14, 6))
plt.subplot(2, 2, 1)
bmap_hail.drawstates()
plt.title("MESH", fontsize=20)
cont = plt.contourf(lon, lat, mesh_data[22],
                    np.arange(5, 105, 5), cmap="PuBuGn_r")
sp = 2
for plot_name, image in labels.items():
    plt.subplot(2, 2, sp)
    bmap_hail.drawstates()
    ht = plt.title(plot_name, fontsize=20)
    pcol = plt.contourf(lon, lat, np.ma.array(image, mask=image < 1),
                        np.arange(image.max() + 1), cmap=cm)
    # Find object centroids
    props = regionprops(image)
    centroids = np.round([prop.centroid for prop in props]).astype(int)
    plt.scatter(lon[centroids[:, 0], centroids[:, 1]],
                lat[centroids[:, 0], centroids[:, 1]], 10, 'r')
    sp += 1
# In[49]:
# Red regions are objects; foothills regions are shown in orange
pixels, quant_map = ew.quantize(gaussian_filter(mesh_data[22], 1))
marked = ew.find_local_maxima(gaussian_filter(mesh_data[22], 1))
plt.figure(figsize=(30, 15))
bmap_hail.drawstates()
plt.contourf(lon, lat, marked, [-3, -2, 0, 100], colors=['orange', 'white', 'red'])
# ##Hysteresis vs. Enhanced Watershed
# * Hysteresis
# 1. Only 2 parameters to set
# 2. Captures entirety of regions
# 3. Can combine seemingly distinct objects
# * Enhanced Watershed
# 1. Captures different scales depending on size threshold (see the sketch after this list)
# 2. Captures core of objects
# 3. Can separate even adjacent objects
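# The scale dependence noted above can be explored by rebuilding the enhanced watershed with different size thresholds; a sketch reusing the constructor arguments documented earlier:

# In[ ]:

smoothed_mesh = gaussian_filter(mesh_data[22], 1)
for size_threshold in [25, 100, 300]:
    ew_scale = EnhancedWatershed(5, 1, 100, size_threshold, 50)
    scale_labels = ew_scale.label(smoothed_mesh)
    print("Size threshold {0:3d}: {1:d} objects".format(size_threshold, scale_labels.max()))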
# ##Solar Irradiance Prediction
# * Solar power is providing a rapidly growing share of the electricity generated worldwide
# * Electric grids are more vulnerable to irregular fluctuations in solar energy generation
# * Improved forecasting of solar irradiance leads to fewer surprises, cheaper solar energy
# * Best forecasting methods combine NWP output with a statistical learning model
# ## Gridded Atmospheric Forecast System (GRAFS)
# * Best statistical forecasts come from 1 model fitted to 1 site with a long data record
# * With rapid growth of new solar sites and distributed solar, harder to build up data record
# * Solution: create a statistical learning model framework that learns from many stations and applies predictions to a grid
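# A minimal sketch of the gridding step, assuming a hypothetical `grid_features` DataFrame with one row per grid cell and the same NWP-derived columns used for training at the stations:

# In[ ]:

def predict_on_grid(model, grid_features, input_columns, grid_shape):
    """Apply a station-trained model to every cell of a forecast grid.

    `grid_features` and `grid_shape` are hypothetical stand-ins for
    gridded data not included with this presentation.
    """
    return model.predict(grid_features[input_columns]).reshape(grid_shape)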
# ##Loading Data
# * NWP Output comes from downscaled NAM
# * Observations are from MADIS pyranometers in the Southwest US
# * Pandas library can be used to read csv files quickly and import data into a Data Frame
# In[6]:
solar_training_data = pd.read_csv("data/grafs_train_data.csv")
solar_testing_data = pd.read_csv("data/grafs_test_data.csv")
print(solar_training_data.columns)
# In[7]:
train_stations = solar_training_data['station'].unique()
max_solar = np.zeros(train_stations.size)
station_lats = np.zeros(train_stations.size)
station_lons = np.zeros(train_stations.size)
for ts, train_station in enumerate(train_stations):
    is_station = solar_training_data['station'] == train_station
    station_idxs = np.nonzero(is_station)[0]
    max_solar[ts] = solar_training_data['av_dswrf_sfc'][is_station].max()
    station_lons[ts] = solar_training_data['lon'][station_idxs[0]]
    station_lats[ts] = solar_training_data['lat'][station_idxs[0]]
bmap = Basemap(projection='cyl', resolution='l',
               llcrnrlon=-125, urcrnrlon=-93, llcrnrlat=25, urcrnrlat=43)
plt.figure(figsize=(15, 7))
bmap.drawstates()
bmap.drawcoastlines()
bmap.drawcountries()
plt.title("Training Stations", fontsize=20)
plt.scatter(station_lons, station_lats, 20, max_solar, cmap=plt.get_cmap('Reds'))
# ##Statistical (Machine) Learning Models
# * Scikit-learn provides a straightforward Python interface to many common statistical learning models
# In[8]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
# Linear regression
from sklearn.linear_model import Ridge
ridge = Pipeline([('anova', SelectKBest(f_regression, k=15)), ('ridge', Ridge(alpha=1))])
# Random forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, max_features='sqrt', max_depth=6)
# Gradient Boosting Regression Trees
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, loss='lad', max_depth=6)
#K-Nearest Neighbors Regression
from sklearn.neighbors import KNeighborsRegressor
knr = Pipeline([('anova', SelectKBest(f_regression, k=15)),
                ('knn', KNeighborsRegressor(n_neighbors=10, weights='uniform'))])
# ## Fitting the Models
# In[9]:
input_columns = np.array(['valid_hour_pst', 'forecast_hour', 'asp', 'dem', 'slope', 'lat', 'lon',
                          'T_f', 'T_f_mean', 'T_f_max', 'T_f_min', 'T_f_median', 'T_f_correlate', 'T_f_gradient',
                          'av_dswrf_sfc_f', 'av_dswrf_sfc_f_mean', 'av_dswrf_sfc_f_max', 'av_dswrf_sfc_f_min',
                          'av_dswrf_sfc_f_median', 'av_dswrf_sfc_f_correlate', 'av_dswrf_sfc_f_gradient',
                          'cloud_cover_f', 'cloud_cover_f_mean', 'cloud_cover_f_max', 'cloud_cover_f_min',
                          'cloud_cover_f_median', 'cloud_cover_f_correlate', 'cloud_cover_f_gradient'])
output_column = 'av_dswrf_sfc'
print "Fitting ridge"
ridge.fit(solar_training_data[input_columns], solar_training_data[output_column])
print "Fitting random forest"
rf.fit(solar_training_data[input_columns], solar_training_data[output_column])
print "Fitting gradient boosted regression"
gbr.fit(solar_training_data[input_columns], solar_training_data[output_column])
print "Fitting KNR"
knr.fit(solar_training_data[input_columns], solar_training_data[output_column])
# ## Ranking Input Features
# Scikit-learn's `feature_importances_` attribute measures how much each input feature reduces tree impurity, averaged over the ensemble; a related, model-agnostic measure is the average increase in error caused by randomizing the values of an input feature (a sketch of the latter follows).
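# The randomization measure can be computed directly with a few lines of numpy and pandas; a minimal sketch, assuming the fitted models and data frames defined in this notebook:

# In[ ]:

def permutation_importance(model, data, input_columns, output_column, column):
    """Increase in mean absolute error after shuffling one input column."""
    base_error = np.mean(np.abs(model.predict(data[input_columns]) - data[output_column]))
    shuffled = data[input_columns].copy()
    shuffled[column] = np.random.permutation(shuffled[column].values)
    shuffled_error = np.mean(np.abs(model.predict(shuffled) - data[output_column]))
    return shuffled_error - base_error

# Example: importance of the NAM irradiance forecast for the random forest
print(permutation_importance(rf, solar_training_data, input_columns,
                             output_column, 'av_dswrf_sfc_f'))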
# In[10]:
rf_var_importance_rankings = np.argsort(rf.feature_importances_)[::-1]
gbr_var_importance_rankings = np.argsort(gbr.feature_importances_)[::-1]
knr_var_names = knr.steps[0][1].get_support(indices=True)
print "Random Forest".ljust(34) + " Gradient Boosting".ljust(35)
for r, g in zip(rf_var_importance_rankings[:15],gbr_var_importance_rankings[:15]):
print "{0}: {1:0.3f} | {2}: {3:0.3f}".format(input_columns[r].ljust(25),
rf.feature_importances_[r],
input_columns[g].ljust(25),
gbr.feature_importances_[g])
# ## Comparing Model Error
# In[11]:
def mean_absolute_error(forecast, obs):
    return np.mean(np.abs(forecast - obs))

predictions = {}
predictions['Ridge Regression'] = ridge.predict(solar_testing_data[input_columns])
predictions['Random Forest'] = rf.predict(solar_testing_data[input_columns])
predictions['Gradient Boosting'] = gbr.predict(solar_testing_data[input_columns])
predictions['KNR'] = knr.predict(solar_testing_data[input_columns])
print('NAM'.ljust(20) + "{0:0.3f}".format(mean_absolute_error(solar_testing_data['av_dswrf_sfc_f'],
                                                              solar_testing_data[output_column])))
for model_name, preds in predictions.items():
    print(model_name.ljust(20) + "{0:0.3f}".format(mean_absolute_error(preds, solar_testing_data[output_column])))
# ## Visualizing Model Predictions
# In[12]:
plt.figure(figsize=(15, 4))
m = 1
for model_name, preds in predictions.items():
    plt.subplot(1, len(predictions), m)
    plt.scatter(preds, solar_testing_data[output_column], 1, 'k')
    plt.plot(np.arange(0, 900, 100), np.arange(0, 900, 100), 'r--')
    plt.xlim(0, 800)
    plt.ylim(0, 800)
    plt.xlabel(model_name, fontsize=14)
    if m == 1:
        plt.ylabel("Obs", fontsize=14)
    m += 1
# ##Visualizing Station Error
# In[13]:
test_stations = solar_testing_data['station'].unique()
station_errors = np.zeros((test_stations.size))
station_lats = np.zeros(test_stations.size)
station_lons = np.zeros(test_stations.size)
# Calculate the mean absolute error at each station
for ts, test_station in enumerate(test_stations):
    is_station = solar_testing_data['station'] == test_station
    station_idxs = np.nonzero(is_station)[0]
    station_errors[ts] = mean_absolute_error(predictions['Gradient Boosting'][station_idxs],
                                             solar_testing_data.loc[is_station, output_column])
    station_lons[ts] = solar_testing_data['lon'][station_idxs[0]]
    station_lats[ts] = solar_testing_data['lat'][station_idxs[0]]
# Plot the station errors
bmap = Basemap(projection='cyl', llcrnrlon=-125, urcrnrlon=-93, llcrnrlat=25, urcrnrlat=43)
plt.figure(figsize=(15, 7))
bmap.drawstates()
bmap.drawcoastlines()
bmap.drawcountries()
plt.title("Gradient Boosting Regression Mean Absolute Error by Station", fontsize=18)
plt.scatter(station_lons, station_lats, np.sqrt(station_errors) * 4, station_errors,
            vmin=station_errors.min(),
            vmax=np.percentile(station_errors, 95),
            cmap='RdBu_r')
plt.colorbar()
# ## Individual Site Forecasts
# * Visualize the time series of forecasts and observations
# In[24]:
def plot_station_errors(solar_testing_data, station_predictions, model_name, station_number):
    station_idxs = np.where(solar_testing_data['station'] == station_number)[0]
    plt.plot(station_predictions[station_idxs], 'ro-', label=model_name)
    plt.plot(solar_testing_data[output_column].values[station_idxs], 'bo-', label='Obs')
    plt.ylim(0, 700)
    plt.xticks(np.arange(len(station_idxs)), solar_testing_data['valid_hour_pst'].values[station_idxs])
    plt.title(model_name, fontsize=16)

# Stations are ordered by their gradient boosted regression error
station_number = test_stations[np.argsort(station_errors)][-2]
plt.figure(figsize=(15, 6))
for m, model in enumerate(predictions):
    plt.subplot(2, len(predictions) // 2, m + 1)
    if m % 2 == 0:
        plt.ylabel("Global Horizontal Irradiance")
    plot_station_errors(solar_testing_data, predictions[model], model, station_number)
    if m > 1:
        plt.xlabel("Valid Hour (PST)")
# Red lines: model forecasts; blue lines: observations
# ## Summary
# * Predicting high impact weather requires interpretation of large amounts of data
# * Python and data science tools can ease every step of the data interpretation process
# * Presentation, code, and data available at https://www.github.com/djgagne/ucar_sea_2015/
# * Please download the presentation and experiment with the data and code
# * Questions? **Email**: dgagne@ucar.edu **Twitter**: @DJGagneDos