Initilization of the graphics system and computational modules used in this IPython notebook.
# Display graphics inline with the notebook
%matplotlib inline
# Standard Python modules
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import os
import datetime
# Modules to display images and data tables
from IPython.display import Image
from IPython.core.display import display
# Data Directory
dir = '../data/'
img = '../images/'
A crude estimate of inflows can be found starting with the balance equation
$$A_{RL}\frac{dH_{RL}}{dt} = I_{RL}(t) - O_{RL}(t)$$where the subscript RL refers to Rainy Lake, and $A$, $H$, $I$, and $O$ refer to area, lake level, inflow, and outflow, respectively. Discretizing in time
$$\frac{A_{RL}}{\Delta t}\left(H_{RL}(k+1)-H_{RL}(k)\right) = I_{RL}(k) - O_{RL}(k)$$Since the sole output flow from Rainy Lake is through Rainy River, we can solve for inflow
$$I_{RL}(k) = \frac{A_{RL}}{\Delta t}\left(H_{RL}(k+1)-H_{RL}(k)\right) + O_{RL}(k)$$The estimation method is sensitive to small errors in level measurement and therefore provides an estimate of the inflow plagued by excessive noise.
# Load Rainy River Flow and Rainy Lake Level data
RR = pd.read_pickle(dir + 'RR.pkl')
RL = pd.read_pickle(dir + 'RL.pkl')
# Load regression formulae for lake areas
import pickle
with open(dir + 'area.pkl', 'rb') as handle:
area = pickle.load(handle)
area
{'Namakan Reservoir': poly1d([ 20.28853147, -6652.59327508]), 'Rainy Lake': poly1d([ 90.73298003, -29751.96877469])}
# Compute time series for Rainy Lake area in sq. meters
areaRL = pd.Series(area['Rainy Lake'](RL), index = RL.index) * 1.0e6
# Estimate Rainy Lake inflow in cubic meters per second
RLinflow = areaRL*(RL.shift(-1) - RL)/(24*3600) + RR
plt.figure(figsize=(10,3.5))
RLinflow.plot(linewidth=0.5)
plt.title('Rainy Lake Inflows {0}-{1}'.format(
RLinflow.index.min().strftime('%Y'),
RLinflow.index.max().strftime('%Y')))
plt.ylabel('cubic meters/sec')
plt.grid()
Examining the estimated inflows to Rainy Lake, there are evident inconsistencies in the data prior to 1920, and that may extend through to the about 1970. For that reason, only data from 1970 on is retained in a data set to be used for simulating lake levels under alternative control regimes.
# Plot raw data from 1970 forward
plt.figure(figsize=(10,6))
plt.subplot(3,1,1)
RLinflow['1970':].plot()
plt.title('Rainy Lake Inflow 1970--')
plt.grid()
plt.subplot(3,1,2)
RR['1970':].plot()
plt.title('Rainy Lake Outflow 1970--')
plt.grid()
plt.subplot(3,1,3)
RL['1970':].plot()
plt.title('Rainy Lake Level 1970--')
plt.grid()
plt.tight_layout()
# Plot flow histograms and save image
fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(1,1,1)
plt.hold(True)
RLinflow['1970':].hist(bins=range(0, 3000 + 25, 25), \
color = 'blue', alpha = 0.5, log=True, normed=1)
RR['1970':].hist(bins=range(0, 3000 + 25, 25), \
color = 'yellow', alpha = 0.5, log=True, normed=1)
plt.plot([950.0,950.0],[0,0.001],'r',lw=2,alpha=0.6)
plt.hold(False)
plt.xlim(-50,2000)
plt.ylim(0,0.01)
plt.xlabel('Flow [cubic meters/second]')
plt.ylabel('Frequency')
plt.legend(['Outflow at AGO Level', 'Inflow','Outflow'])
plt.title('Frequency vs Flowrates for Rainy Lake, 1970-2014')
plt.savefig(img + 'RLFlowFreq.png')
# Plot level histograms and save image
fig = plt.figure(figsize=(10,4))
ax = fig.add_subplot(1,1,1)
plt.hold(True)
RL['1970':].hist(bins=np.arange(336.3,339.0,0.05), \
color = 'blue', alpha = 0.5, log=True, normed=1)
plt.plot([337.9,337.9],[0.001,1.0],'r',lw=2)
plt.hold(False)
plt.xlim(336.3,339.0)
plt.ylim(0.001,3)
plt.legend(['AGO','Level'])
plt.xlabel('Level [meters]')
plt.ylabel('Frequency')
plt.title('Frequency vs Stage for Rainy Lake, 1970-2014')
plt.savefig(img + 'RLLevelFreq.png')
yr = '1970'
RLLevelFlow = pd.concat([RL[yr:],RLinflow[yr:],RR[yr:],],axis=1)
RLLevelFlow.columns = ['H','I','O']
RLLevelFlow = RLLevelFlow.interpolate()
RLLevelFlow.to_csv(dir + 'RLLevelFlow.csv')
RLLevelFlow['1971']
H | I | O | |
---|---|---|---|
1971-01-01 | 337.365 | 389.608017 | 479.0 |
1971-01-02 | 337.356 | 419.462053 | 479.0 |
1971-01-03 | 337.350 | 376.833097 | 476.0 |
1971-01-04 | 337.340 | 383.844301 | 473.0 |
1971-01-05 | 337.331 | 406.309788 | 436.0 |
1971-01-06 | 337.328 | 398.319239 | 428.0 |
1971-01-07 | 337.325 | 368.657381 | 428.0 |
1971-01-08 | 337.319 | 368.695186 | 428.0 |
1971-01-09 | 337.313 | 322.099488 | 411.0 |
1971-01-10 | 337.304 | 381.394850 | 411.0 |
1971-01-11 | 337.301 | 368.808603 | 428.0 |
1971-01-12 | 337.295 | 398.423204 | 428.0 |
1971-01-13 | 337.292 | 334.865311 | 394.0 |
1971-01-14 | 337.286 | 349.451558 | 379.0 |
1971-01-15 | 337.283 | 379.000000 | 379.0 |
1971-01-16 | 337.283 | 313.075689 | 382.0 |
1971-01-17 | 337.276 | 290.449188 | 379.0 |
1971-01-18 | 337.267 | 377.000000 | 377.0 |
1971-01-19 | 337.267 | 349.511417 | 379.0 |
1971-01-20 | 337.264 | 261.083472 | 379.0 |
1971-01-21 | 337.252 | 298.117347 | 357.0 |
1971-01-22 | 337.246 | 295.155152 | 354.0 |
1971-01-23 | 337.240 | 315.192958 | 374.0 |
1971-01-24 | 337.234 | 288.846145 | 377.0 |
1971-01-25 | 337.225 | 254.789521 | 382.0 |
1971-01-26 | 337.212 | 320.369383 | 379.0 |
1971-01-27 | 337.206 | 289.110782 | 377.0 |
1971-01-28 | 337.197 | 340.000000 | 340.0 |
1971-01-29 | 337.197 | 243.195845 | 331.0 |
1971-01-30 | 337.188 | 298.760302 | 328.0 |
... | ... | ... | ... |
1971-12-02 | 337.792 | 675.191433 | 779.0 |
1971-12-03 | 337.782 | 747.888935 | 779.0 |
1971-12-04 | 337.779 | 651.593543 | 776.0 |
1971-12-05 | 337.767 | 679.808574 | 773.0 |
1971-12-06 | 337.758 | 586.787272 | 773.0 |
1971-12-07 | 337.740 | 604.780018 | 770.0 |
1971-12-08 | 337.724 | 674.214982 | 767.0 |
1971-12-09 | 337.715 | 643.400059 | 767.0 |
1971-12-10 | 337.703 | 610.689101 | 765.0 |
1971-12-11 | 337.688 | 604.925384 | 759.0 |
1971-12-12 | 337.673 | 622.673446 | 756.0 |
1971-12-13 | 337.660 | 688.546579 | 750.0 |
1971-12-14 | 337.654 | 619.168769 | 742.0 |
1971-12-15 | 337.642 | 619.319990 | 742.0 |
1971-12-16 | 337.630 | 603.293944 | 736.0 |
1971-12-17 | 337.617 | 587.267898 | 719.0 |
1971-12-18 | 337.604 | 571.241851 | 725.0 |
1971-12-19 | 337.591 | 555.215805 | 728.0 |
1971-12-20 | 337.578 | 539.189759 | 722.0 |
1971-12-21 | 337.560 | 627.765004 | 719.0 |
1971-12-22 | 337.551 | 526.572361 | 719.0 |
1971-12-23 | 337.532 | 509.735830 | 722.0 |
1971-12-24 | 337.511 | 590.713533 | 742.0 |
1971-12-25 | 337.496 | 533.529744 | 745.0 |
1971-12-26 | 337.475 | 562.088778 | 753.0 |
1971-12-27 | 337.456 | 602.579907 | 753.0 |
1971-12-28 | 337.441 | 580.816191 | 731.0 |
1971-12-29 | 337.426 | 575.052475 | 725.0 |
1971-12-30 | 337.411 | 589.250257 | 719.0 |
1971-12-31 | 337.398 | 565.592246 | 745.0 |
365 rows × 3 columns
Plotting the estimated inflow for a typical year demonstrates the noisy characteristics of this estimate.
yr = '2014'
plt.figure(figsize=(9,7))
plt.subplot(2,1,1)
plt.hold(True)
RLinflow[yr].plot(color='b',linewidth=1)
RR[yr].plot(color='r')
plt.hold(False)
ax = plt.axis()
plt.ylim([0,ax[3]])
plt.ylabel('[cubic meters/sec]')
plt.title('Estimated Inflow to Rainy Lake ' + yr)
plt.legend(['Inflow','Outflow'],loc='upper left')
plt.subplot(2,1,2)
RL[yr].plot()
plt.ylabel('Level [meters]')
plt.title('Rainy Lake Level')
plt.tight_layout()
For this first model we assume the inflows and outflows to Rainy Lake are exogenous disturbances driven by zero mean white noise. Later we can consider models that incorporate features such as mean seasonal inflows, the dependence of outflow on lake level, and the an additional state variable representing the International Falls Dam. The purpose of this first model is to simply test the idea that a filtering and estimation techniques can provide better knowledge of the ungauged inflows to Rainy Lake.
The model is written
$$\underbrace{\left[\begin{array}{c} H_{RL}(k+1) \\ I_{RL}(k+1) \\ O_{RL}(k+1) \end{array}\right]}_{x(k+1)} = \underbrace{\left[ \begin{array}{ccc} 1 & \frac{\Delta t}{A_{RL}} & -\frac{\Delta t}{A_{RL}} \\0 & 1 & 0\\0 & 0 & 1\end{array}\right]}_{A} \underbrace{\left[\begin{array}{c} H_{RL}(k) \\ I_{RL}(k) \\ O_{RL}(k) \end{array}\right]}_{x(k)} + \underbrace{\left[ \begin{array}{c} 0 \\ w_{I}(k) \\ w_{O}(k)\end{array}\right]}_{w(k)}$$where $H$, $I$, $O$ are lake level, inflow, and outflow, respectively. The signals $w_I(k)$, and $w_O(k)$ are zero-mean independent and identically distributed random increments to changing lake inflow and outflows. The vector $w(k)$ is assumed to be distributed as multivariate Normal distribution with zero mean and a covariance $Q$.
The measurement model is written
$$\underbrace{\left[\begin{array}{c} y_H(k) \\ y_O(k) \end{array}\right]}_{y(k)} = \underbrace{\left[\begin{array}{ccc} 1 & 0 & 0 \\ 0 & 0 & 1 \end{array}\right]}_{C} \underbrace{\left[\begin{array}{c} H_{RL}(k) \\ I_{RL}(k) \\ O_{RL}(k) \end{array}\right]}_{x(k)} + \underbrace{\left[\begin{array}{c} v_{H}(k) \\ v_{O}(k) \end{array}\right]}_{v(k)} $$where the measurement noise vector $v(k)$ is an i.i.d. random variate with zero mean and a covariance $R$.
A Kalman filter provides a two step method for estimate values of the state vector $x(k)$. The the estimate of $x(k)$ using all information available up to time $k-1$ is the prediction step given by
$$\hat{x}(k|k-1) = A \hat{x}(k-1|k-1)$$with covariance
$$\hat{P}(k|k-1) = A \hat{P}(k-1|k-1) A^T + Q$$In the measurement update step, an innovation is the difference between the actual measurement at time $k$ versus the predicted measurement
$$e(k) = y(k) - C \hat{x}(k|k-1)$$with covariance
$$S(k) = C\hat{P}(k|k-1)C^T + R$$The Kalman filter gain is given by
$$K(k) = \hat{P}(k|k-1)C^TS^{-1}(k)$$which is used to compute the updated state estimate
$$\hat{x}(k|k) = \hat{x}(x|k-1) + K(k)e(k)$$and
$$\hat{P}(k|k) = \hat{P}(k|k-1) - K(k)S(k)K^T(k)$$This is implemented in the following cells for the purpose of estimating inflow to Rainy Lake from historical data of lake levels and outflows.
areaRL = float(97500*10000)
dt = float(24*3600)
A = np.array([[1.0, dt/areaRL, -dt/areaRL],
[0.0, 1.0, 0.0],
[0.0, 0.0, 1.0]])
C = np.array([[1.0, 0.0, 0.0],
[0.0, 0.0, 1.0]])
# Covariance of state inputs
Q = np.array([[0.0, 0.0, 0.0],
[0.0, 10000.0, 0.0],
[0.0, 0.0, 10000.0]])
# Measurement covariance
R = np.array([[0.0001, 0],[0.0, 1000.0]])
# Load Rainy Lake Level and Rainy River Flow data
yrA = '1971'
yrB = '2010'
y = pd.concat([pd.read_pickle(dir+'RL.pkl')[yrA:yrB],
pd.read_pickle(dir+'RR.pkl')[yrA:yrB]],axis=1)
y.columns = ['H','O']
y = y.interpolate()
# DataFrame to store results
x = pd.DataFrame(index = y.index,columns=('H','I','O'))
e = pd.DataFrame(index = y.index,columns=('H','O'))
# Initial values for x and P
x.ix[0] = np.array([y.ix[0]['H'], y.ix[0]['O'], y.ix[0]['O']])
e.ix[0] = np.array([0,0])
P = np.array([[0.001, 0.0, 0.0], [0.0, 100.0, 0.0], [0.0, 0.0, 100.0]])
# Kalman Filter
for k in range(1,len(y.index)):
# Prediction
xpred = np.dot(A,x.ix[k-1])
Ppred = np.dot(np.dot(A,P),np.transpose(A)) + Q
# Measurement
ymeas = np.array([y.ix[k]['H'],y.ix[k]['O']])
# Update
e.ix[k] = ymeas - np.dot(C,xpred)
S = np.dot(np.dot(C,Ppred),np.transpose(C)) + R
K = np.dot(np.dot(Ppred,np.transpose(C)),np.linalg.inv(S))
x.ix[k] = xpred + np.dot(K,e.ix[k])
P = Ppred - np.dot(np.dot(K,S),np.transpose(K))
# Pickle Rainy Lake Inflows for use in other notebooks
x['I'].to_pickle('../data/RLInflows.pkl')
x['H'].to_pickle('../data/RLLevels.pkl')
x['O'].to_pickle('../data/RLOutflows.pkl')
x.to_csv('../data/RLEstimates.csv')
plt.figure(figsize=(9,7))
plt.subplot(2,1,1)
plt.hold(True)
x['I'].plot(linewidth=0.5)
x['O'].plot(linewidth=0.5)
plt.hold(False)
plt.title('Estimated Inflow and Outflows ' + yrA + '-' + yrB)
plt.legend(['Estimated Inflow (I)','Estimated Outflow (O)'])
plt.subplot(2,1,2)
plt.hold(True)
x['H'].plot(linewidth=0.5)
RL[yrA:yrB].plot(linewidth=0.5)
plt.hold(False)
plt.title('Estimated Lake Level ' + yrA + '-' + yrB)
plt.legend(['Estimated Level (H)','Measured Level'])
plt.tight_layout()
plt.figure(figsize=(9,7))
plt.subplot(2,1,1)
e['H'].plot(linewidth=0.5)
plt.title('Level Estimation Error ' + yrA + '-' + yrB)
plt.subplot(2,1,2)
e['O'].plot(linewidth=0.5)
plt.title('Outflow Estimation Error ' + yrA + '-' + yrB)
plt.tight_layout()
# Shift to a common year
I = pd.DataFrame([])
for yr,r in x['I'].groupby(x.index.year):
shift = datetime.datetime(2014,1,1) - datetime.datetime(yr,1,1)
r = r.tshift(shift.days)
r.name = yr
I = pd.concat([I,r],axis=1)
# Plot
plt.figure(figsize=(9,6))
plt.subplot(2,1,1)
idx = range(1971,2000)
I[idx].mean(axis=1).plot(color='r',linewidth=4)
plt.hold(True)
I[idx].plot(ax=plt.gca(),legend=False)
I[idx].mean(axis=1).plot(color='r',linewidth=4)
plt.ylim(0,1800)
plt.title('Rainy Lake Inflow 1971-1999')
plt.hold(False)
import matplotlib.dates as mdates
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b'))
# Plot
plt.subplot(2,1,2)
idx = range(2000,2011)
I[idx].mean(axis=1).plot(color='r',linewidth=4)
plt.hold(True)
I[idx].plot(ax=plt.gca(),legend=False)
I[idx].mean(axis=1).plot(color='r',linewidth=4)
plt.ylim(0,1800)
plt.title('Rainy Lake Inflow 2000-2010')
plt.hold(False)
import matplotlib.dates as mdates
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b'))
plt.tight_layout()
plt.figure(figsize=(12,6))
plt.hold(True)
q_hi = 80
q_lo = 20
idx = range(1971,2000)
plt.plot(I[idx].index,
np.percentile(I[idx],q=q_hi,axis=1),
color='y',alpha=0.6)
a, = plt.plot(I[idx].index,
np.percentile(I[idx],q=q_lo,axis=1),
color='y',alpha=0.6,label='1971-1999')
plt.fill_between(I[idx].index,
np.percentile(I[idx],q=q_hi,axis=1).tolist(),
np.percentile(I[idx],q=q_lo,axis=1).tolist(),
color='y',alpha=0.6)
idx = range(2000,2011)
plt.plot(I[idx].index,
np.percentile(I[idx],q=q_hi,axis=1),
color='b',alpha=0.4)
b, = plt.plot(I[idx].index,
np.percentile(I[idx],q=q_lo,axis=1),
color='b',alpha=0.4,label='2000-2010')
plt.fill_between(I[idx].index,
np.percentile(I[idx],q=q_hi,axis=1).tolist(),
np.percentile(I[idx],q=q_lo,axis=1).tolist(),
color='b',alpha=0.4)
plt.hold(False)
plt.title('{:d}th to {:d}th Percentiles of Rainy Lake Inflows'.format(q_lo,q_hi))
plt.ylabel('[cubic meters/sec]')
plt.legend(handles=[a,b])
import matplotlib.dates as mdates
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b'))
fname = '../images/RainyLakeInflows.png'
plt.savefig(fname)
!convert $fname -trim $fname
!convert $fname -transparent white $fname
%%capture capt
%run -i Predicted_Effect_of_Rule_Curve_Changes_on_Rainy_River_Flows.ipynb
areaNL = 25973 * 10000
dNL1970 = 0.5*(NL1970['LRC']+NL1970['URC']).diff()
dNL2000 = 0.5*(NL2000['LRC']+NL2000['URC']).diff()
qNL1970 = -areaNL*dNL1970/(24*3600)
qNL2000 = -areaNL*dNL2000/(24*3600)
dqNL = qNL2000 - qNL1970
plt.figure(figsize=(12,6))
plt.hold(True)
(I[range(2000,2011)].mean(axis=1)-I[range(1971,2000)].mean(axis=1)).plot()
plt.fill_between(dqNL.index,0, dqNL.tolist(), color='b', alpha='0.4')
plt.hold(False)
plt.title('Change in Mean Rainy Lake Inflow, 1971-1999 versus 2000-2011')
plt.ylabel('cubic meters/sec')
plt.gca().set_xticklabels(['Jan','Feb','Mar','Apr','May','Jun',
'Jul','Aug','Sep','Oct','Nov','Dec']);
fname = '../images/ChangeInMeanInflow.png'
plt.savefig(fname)
!convert $fname -trim $fname
!convert $fname -transparent white $fname
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-19-df2c014f9e30> in <module>() 1 areaNL = 25973 * 10000 2 ----> 3 dNL1970 = 0.5*(NL1970['LRC']+NL1970['URC']).diff() 4 dNL2000 = 0.5*(NL2000['LRC']+NL2000['URC']).diff() 5 NameError: name 'NL1970' is not defined
RLInflows = pd.read_pickle('./data/RLInflows.pkl')
df = pd.concat([RLInflows,RL['1971':'2010']],axis=1)
df.columns=['I','H']
df['I'].plot(linewidth=0.5)
plt.hold(True)
df['H'].plot(linewidth=2)
plt.hold(False)
plt.ylim(0,2400)
plt.scatter(df.ix['1971':'1999']['I'],df.ix['1971':'1999']['H'],color='b',alpha=0.5)
plt.hold(True)
plt.scatter(df.ix['2000':'2010']['I'],df.ix['2000':'2010']['H'],color='r',alpha=0.5)
ax = plt.axis()
plt.plot([ax[0],ax[1]],[337.90,337.90],'y--')
plt.plot([ax[0],ax[1]],[337.75,337.75],'y--')
plt.plot([ax[0],ax[1]],[337.20,337.20],'y--')
plt.plot([ax[0],ax[1]],[336.70,336.70],'y--')
plt.xlim(ax[0],ax[1])
plt.hold(False)
# Load inflow data
RLInflows = pd.read_pickle('./data/RLInflows.pkl')
def plotInflowData(Q):
plt.figure(figsize=(10,3.5))
Q.plot(linewidth=0.75)
plt.title('Rainy Lake Inflows {0}-{1}'.format(
Q.index[0].strftime('%Y'),
Q.index[-1].strftime('%Y')))
plt.ylabel('cubic meters/sec');
plotInflowData(RLInflows)
plotInflowData(RLInflows['1980':'1985'])
RLInflows.hist(bins=np.arange(-100,2500,20),alpha=0.5)
from pandas.tools.plotting import lag_plot
lag_plot(RLInflows,lag=1)
plt.axis('equal')
plt.hold(True)
plt.plot([RLInflows.min(),RLInflows.max()],[RLInflows.min(),RLInflows.max()],'r--')
plt.hold(False)
plt.title('Lag Plot of Daily Rainy Lake Inflows {0}-{1}'.format(
RLInflows.index.min().strftime('%Y'),
RLInflows.index.max().strftime('%Y')))
plt.xlabel('Inflow(t) cubic meters/sec')
plt.ylabel('Inflow(t+1) cubic meters/sec');
The augmented Dickey-Fuller test is used to test for a unit root in a univariate process.
import statsmodels.api as sm
(adf, pvalue, usedlag, nobs, cvalues, icbtest) = sm.tsa.adfuller(RLInflows)
print adf
print pvalue
-11.7600388434 1.15381832842e-21
The test statistic is strongly negative, and we can soundly reject a unit root and therefore a the non-stationary hypothesis. This was also abundantly clear from the original data.
Next we consider autocorrelations
import statsmodels.graphics.tsaplots as tsaplots
plt.hold(True)
plt.close()
tsaplots.plot_acf(RLInflows,lags = 120);
plt.hold(False)
plt.figure()
plt.hold(True)
plt.close()
tsaplots.plot_pacf(RLInflows,lags = 20);
plt.hold(False)
sm.tsa.arma_order_select_ic(RLInflows.tolist(), ic=['aic', 'bic'], trend='nc')
{'aic': 0 1 2 0 NaN 194948.116327 181328.016361 1 149695.677261 148697.659247 148480.532504 2 149022.025497 148564.607694 148370.097388 3 148400.087858 148394.768405 148364.041088 4 148398.412531 148382.398248 148356.413373, 'aic_min_order': (4, 2), 'bic': 0 1 2 0 NaN 194963.295250 181350.784745 1 149710.856184 148720.427632 148510.890350 2 149044.793881 148594.965540 148408.044696 3 148430.445704 148432.715713 148409.577857 4 148436.359838 148427.935017 148409.539604, 'bic_min_order': (2, 2)}
arma_mod = sm.tsa.ARMA(RLInflows.tolist(), (4,2)).fit()
print "Coeff Value Std. Err. pvalue"
K = arma_mod.k_trend
k = 0
j = 0
while k < K:
print 'tr[{0}] = {1:9.4f} {2:7.4f} {3:7.4f}'.format(
j, arma_mod.params[k], arma_mod.bse[k], arma_mod.pvalues[k])
j += 1
k += 1
K += arma_mod.k_exog
j = 0
while k < K:
print 'ex[{0}] = {1:9.4f} {2:7.4f} {3:7.4f}'.format(
j, arma_mod.params[k], arma_mod.bse[k], arma_mod.pvalues[k])
j += 1
k += 1
print ''
K += arma_mod.k_ar
j = 0
while k < K:
print 'ar[{0}] = {1:9.4f} {2:7.4f} {3:7.4f}'.format(
j, arma_mod.params[k], arma_mod.bse[k], arma_mod.pvalues[k])
j += 1
k += 1
print ''
K += arma_mod.k_ma
j = 1
while k < K:
print 'ma[{0}] = {1:9.4f} {2:7.4f} {3:7.4f}'.format(
j, arma_mod.params[k], arma_mod.bse[k], arma_mod.pvalues[k])
j += 1
k += 1
print ''
Coeff Value Std. Err. pvalue tr[0] = 285.3931 21.4288 0.0000 ar[0] = 0.9656 0.1347 0.0000 ar[1] = 0.1292 0.1662 0.4369 ar[2] = -0.2280 0.0740 0.0021 ar[3] = 0.1179 0.0300 0.0001 ma[1] = 0.2749 0.1344 0.0409 ma[2] = -0.2501 0.0371 0.0000
RLPred = pd.Series(arma_mod.predict(1,len(RLInflows),dynamic=False),
index=RLInflows.index)
yrA = '1971'
yrB = '1975'
RLInflows[yrA:yrB].plot(linewidth=0.5)
plt.hold(True)
RLPred[yrA:yrB].plot(linewidth=0.5)
(RLPred[yrA:yrB]-RLInflows[yrA:yrB]).plot(linewidth=0.5)
plt.ylabel('cubic meters/sec')
plt.hold(False)
r = pd.Series(arma_mod.resid,index=RLInflows.index).copy()
plt.figure(figsize=(10,3.5))
r.ix[0:100].plot(linewidth=1)
import scipy.signal
a = np.concatenate((np.array([1]),-arma_mod.arparams))
b = np.concatenate((np.array([1]),arma_mod.maparams))
f = pd.Series(scipy.signal.lfilter(a,b,r) ,index = RLInflows.index)
f.ix[0:100].plot(linewidth=1)
plt.hold(True)
r.mean()
-0.0061535850319375911
sm.stats.durbin_watson(r)
1.9982804647252388
from scipy import stats
stats.normaltest(r)
(11940.331574076443, 0.0)
The very low value returned from scipy.stats.normaltest()
tells us that the residuals are not Normally distributed.
rmean = pd.Series([d.mean() for t,d in f.groupby(f.index.dayofyear)])
rmean_smooth = pd.rolling_window(
pd.concat([rmean.tshift(-len(rmean),freq=1),rmean,rmean.tshift(len(rmean),freq=1)]),
30,'triang',center=True)[rmean.index]
rstd = pd.Series([d.std() for t,d in f.groupby(f.index.dayofyear)])
rstd_smooth = pd.rolling_window(
pd.concat([rstd.tshift(-len(rstd),freq=1),rstd,rstd.tshift(len(rstd),freq=1)]),
30,'triang',center=True)[rstd.index]
plt.subplot(2,1,1)
plt.hold(True)
rmean.plot(linewidth=1)
rmean_smooth.plot(style='r')
plt.xlabel('Day of Year')
plt.ylabel('cubic meters/sec')
plt.title('Residual Mean')
plt.hold(False)
plt.subplot(2,1,2)
plt.hold(True)
rstd.plot(linewidth=1)
rstd_smooth.plot(style='r')
plt.xlabel('Day of Year')
plt.ylabel('cubic meters/sec')
plt.title('Residual Standard Deviation')
plt.hold(False)
plt.tight_layout()
s = r.copy()
for k in s.index:
s.ix[k] = (s.ix[k] - rmean_smooth.ix[k.dayofyear-1])
s.ix[k] = s.ix[k]/rstd_smooth.ix[k.dayofyear-1]
plt.figure(figsize=(12,3.5))
s.plot(linewidth=0.5);
plt.title('Residuals')
plt.ylabel('cubic meters/sec')
print stats.normaltest(s)
from scipy import stats
df,loc,scale = stats.t.fit(s)
dist = stats.t(df,loc=loc,scale=scale)
plt.figure(figsize=(10.2,3.5))
plt.subplot(1,3,1)
s.hist(bins=200,normed=True)
plt.hold(True)
xmin,xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
plt.plot(x, dist.pdf(x), 'r', linewidth=2)
plt.xlim(-6,6)
plt.title('Probability Density Function')
plt.xlabel('Residual')
plt.ylabel('Probability')
plt.legend(['t-distribution','Data'],loc='upper left')
plt.hold(False)
print df,loc,scale
plt.subplot(1,3,2)
y = [v for v in sorted(s.tolist())]
x = [dist.ppf(float(k)/float(1+len(s))) for k in range(1,len(s)+1)]
plt.hold(True)
plt.scatter(x,y)
plt.plot([min(x),max(x)],[min(x),max(x)],'r')
plt.axis('equal')
plt.xlabel('Fitted Response')
plt.ylabel('Ordered Response')
plt.title('Probability Plot')
plt.hold(False)
plt.subplot(1,3,3)
lag_plot(s)
plt.axis('equal')
plt.title('Lag Plot')
plt.xlabel('s(k)')
plt.ylabel('s(k+1)')
plt.tight_layout()
(9312.1470634169891, 0.0) 2.83007578944 -0.0531931405403 0.362534019742
Removing the seasonality reduced the chi-squared statistic, but it is still clearly a non-Normal distribution.
#e = pd.Series(dist.rvs(len(s)), index = s.index)
e = pd.Series(0, index = s.index)
plt.figure(figsize=(12,7))
for k in e.index:
e.ix[k] = random.choice(s)
plt.subplot(2,1,1)
e.plot(linewidth=0.5)
for k in e.index:
e.ix[k] = e.ix[k]*rstd_smooth.ix[k.dayofyear-1]
#e.ix[k] = random.choice(s)*rstd_smooth.ix[k.dayofyear-1]
e.ix[k] += rmean_smooth.ix[k.dayofyear-1]
plt.subplot(2,1,2)
e.plot(linewidth=0.5)
import scipy.signal
a = np.concatenate((np.array([1]),-arma_mod.arparams))
b = np.concatenate((np.array([1]),arma_mod.maparams))
f = pd.Series(scipy.signal.lfilter(b,a,e) + arma_mod.params[0],index = e.index)
plotInflowData(f)
f['1971':'1975'].plot(linewidth=0.5)
plt.hold(True)
#RLInflows['1981':'1981'].plot()
#(f-RLInflows['1981':'1981']).plot()
#e.plot(linewidth=0.5)
plt.hold(False)
print f.mean()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-28-b03c9fed85a8> in <module>() 4 5 for k in e.index: ----> 6 e.ix[k] = random.choice(s) 7 8 plt.subplot(2,1,1) NameError: name 'random' is not defined
<matplotlib.figure.Figure at 0x117241910>
plotInflowData(f)
#N = 5001
#e = pd.Series(dist.rvs(N), index = pd.date_range(pd.datetime.today().date(), periods=N))
#e = s['1981':'1981'].copy()
#for k in e.index:
# e.ix[k] = e.ix[k]*rstd_smooth.ix[k.dayofyear-1]
# e.ix[k] += rmean_smooth.ix[k.dayofyear-1]
import scipy.signal
a = np.concatenate((np.array([1]),-arma_mod.arparams))
b = np.concatenate((np.array([1]),arma_mod.maparams))
f = pd.Series(scipy.signal.lfilter(b,a,e) + arma_mod.params[0],index = e.index)
f['1971':'1975'].plot(linewidth=0.5)
plt.hold(True)
#RLInflows['1981':'1981'].plot()
#(f-RLInflows['1981':'1981']).plot()
#e.plot(linewidth=0.5)
plt.hold(False)
print f.mean()
import scipy.signal
a = np.concatenate((np.array([1]),-arma_mod.arparams))
b = np.concatenate((np.array([1]),arma_mod.maparams))
y = RLInflows.tolist()
e = scipy.signal.lfilter(a,b,y)
e = pd.Series(e,index=RLInflows.index)
e['1986'].plot(linewidth=0.5)
plt.hold(True)
tsaplots.plot_acf(e,lags = 100);
plt.hold(False)
e = e - e.mean()
ep = e.ix[e >= 0]
en = -e.ix[e <= 0]
en.plot()
en.hist(bins=range(0,300),normed=True,color='g',alpha=0.5)
plt.hold(True)
ep.hist(bins=range(0,300),color='r',alpha=0.5,normed=True)
plt.hold(False)
plt.xlim(0,200)
from pandas.tools.plotting import lag_plot
lag_plot(RLInflows['1971':'1999'].diff(),color='b',alpha=0.4)
plt.hold(True)
lag_plot(RLInflows['2000':'2010'].diff(),color='r',alpha=0.4)
plt.hold(False)
plt.axis('equal')
RLInflows['1971':'1999'].diff().hist(bins=np.arange(-400,1200,20),color='b',alpha=0.5,normed=True)
plt.hold(True)
RLInflows['2000':'2011'].diff().hist(bins=np.arange(-400,1200,20),color='r',alpha=0.5,normed=True)
plt.hold(False)
x['I'].plot()
KINL = pd.read_pickle(dir+'KINL.pkl')
plt.hold(True)
(10*KINL[yrA:yrB]).plot()
x['I'].plot()
plt.hold(False)