#!/usr/bin/env python
# coding: utf-8

# # Ice Out Dates
#
# Records of 'ice-out' and 'ice-in' days on selected Minnesota lakes are available from the Minnesota DNR at their web site [Ice Out Dates](http://www.dnr.state.mn.us/ice_out/index.html). The customs for determining ice-out and ice-in vary from lake to lake, so it is difficult to compare dates across different lakes. The DNR does, however, attempt to use the same observers and criteria for each lake, so the longitudinal data series for a single lake can provide useful insights.

# ## Initialization
#
# ### Load Python Modules

# In[1]:


# Display graphics inline with the notebook
get_ipython().run_line_magic('matplotlib', 'inline')

# Standard Python modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import datetime

import seaborn as sns
sns.set_context('talk')

# Statistics modules
import statsmodels.api as sm


# In[2]:


# Directories where data files and images are stored
dir = "../data/"
img = "../images/"


# ## Rainy Lake

# In[3]:


# Read .csv file
RL_ICEOUT = pd.read_csv(dir + 'IceOut_RL.txt', index_col=0, parse_dates=True, skiprows=1)

# Remove duplicated data by keeping records from a single source
RL_ICEOUT = RL_ICEOUT[RL_ICEOUT[' source'] == 'MNDNR-SCO']

# Drop unneeded columns
RL_ICEOUT = RL_ICEOUT.drop(columns=[' source', ' comments'])

# Create the dependent variable (ice-out day of year) and an independent
# variable that counts years from the first date in the data set
RL_ICEOUT['DOY'] = RL_ICEOUT.index.dayofyear
RL_ICEOUT['n'] = RL_ICEOUT.index.year - RL_ICEOUT.index.year[0]

RL_ICEOUT.head()


# In[4]:


# Regression of ice-out day of year on years since the start of the record
y = RL_ICEOUT['DOY']
x = RL_ICEOUT['n']

def regress(y, x):
    # Ordinary least squares fit with an intercept term
    model = sm.OLS(y, sm.add_constant(x))
    results = model.fit()
    print(results.params)
    print(results.summary())

    # Plot the data and the fitted trend line
    plt.plot(y.index.year, y)
    plt.plot(y.index.year, results.fittedvalues)
    plt.yticks([92, 102, 112, 122, 132, 142],
               ['Apr 1', 'Apr 10', 'Apr 20', 'May 1', 'May 10', 'May 20'])
    ax = plt.gca()

    # Standard deviation of the residuals
    Sy = np.sqrt(np.sum(results.resid**2)/(len(x) - 2))

    # Plot the prediction interval (i.e., the confidence interval for any particular date)
    from scipy import stats
    t = stats.t.ppf(1 - 0.025, df=len(x) - results.df_model - 1)
    conf = t*Sy*np.sqrt(1 + 1.0/len(x) + (x - x.mean())**2/np.sum((x - x.mean())**2))
    ax.fill_between(y.index.year, results.fittedvalues - conf, results.fittedvalues + conf, alpha=0.2)

    # Plot the confidence interval of the regression line
    conf = t*Sy*np.sqrt(1.0/len(x) + (x - x.mean())**2/np.sum((x - x.mean())**2))
    ax.fill_between(y.index.year, results.fittedvalues - conf, results.fittedvalues + conf, alpha=0.4)

    plt.xlabel('Year')
    plt.ylabel('Day of Year')

regress(y, x)

# Add a title and save the figure
plt.title('Rainy Lake Ice Out')

fname = img + 'IceOut_RL.png'
plt.savefig(fname)
get_ipython().system('convert $fname -trim $fname')

# Get the current axes for use in plotting data from other lakes
rl = plt.gca()


# The relatively long data series for Rainy Lake yields a statistically significant trend line. The ice-out day is consistently getting earlier and, at the same time, becoming more variable. As a result, in recent years Rainy Lake has set records for the earliest ice-out and nearly broken records for the latest ice-out.
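# As an illustrative aside that is not part of the original analysis, the fitted trend can be restated in more familiar units. The sketch below refits the same OLS model so that a results object is available in this cell, then reports the slope as days of change per decade along with a 95% confidence interval from `conf_int()`. It assumes the `RL_ICEOUT` DataFrame built above is still in memory.

# In[ ]:


# Illustrative sketch: refit the Rainy Lake regression and restate the trend in days per decade
res = sm.OLS(RL_ICEOUT['DOY'], sm.add_constant(RL_ICEOUT['n'])).fit()

slope = res.params['n']                      # change in ice-out day per year
lo, hi = res.conf_int(alpha=0.05).loc['n']   # 95% confidence interval on the slope

print("Trend: {:.2f} days per decade (95% CI: {:.2f} to {:.2f})".format(
    10*slope, 10*lo, 10*hi))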
# ## Kabetogama Lake

# In[5]:


# Read .csv file
KL_ICEOUT = pd.read_csv(dir + 'IceOut_KL.txt', index_col=0, parse_dates=True, skiprows=1)

# Remove duplicated data by keeping records from a single source
KL_ICEOUT = KL_ICEOUT[KL_ICEOUT[' source'] == 'MNDNR-SCO']

# Drop unneeded columns
KL_ICEOUT = KL_ICEOUT.drop(columns=[' source', ' comments'])

# Determine the trend line; here n counts observations from the start of the record
KL_ICEOUT['DOY'] = KL_ICEOUT.index.dayofyear
KL_ICEOUT['n'] = pd.Series(np.arange(0, len(KL_ICEOUT)), index=KL_ICEOUT.index)

y = KL_ICEOUT['DOY']
x = KL_ICEOUT['n']

regress(y, x)

plt.title('Kabetogama Lake Ice Out')
plt.ylabel('Day of Year')
plt.xlabel('Year')
plt.axis(rl.axis())

fname = img + 'IceOut_KL.png'
plt.savefig(fname)
get_ipython().system('convert $fname -trim $fname')


# ## Lake of the Woods

# In[6]:


# Read .csv file
LOW_ICEOUT = pd.read_csv(dir + 'IceOut_LOW.txt', index_col=0, parse_dates=True, skiprows=1)

# Remove duplicated data by keeping records from a single source
LOW_ICEOUT = LOW_ICEOUT[LOW_ICEOUT[' source'] == 'MNDNR-SCO']

# Drop unneeded columns
LOW_ICEOUT = LOW_ICEOUT.drop(columns=[' source', ' comments'])

# Determine the trend line; here n counts observations from the start of the record
LOW_ICEOUT['DOY'] = LOW_ICEOUT.index.dayofyear
LOW_ICEOUT['n'] = pd.Series(np.arange(0, len(LOW_ICEOUT)), index=LOW_ICEOUT.index)

y = LOW_ICEOUT['DOY']
x = LOW_ICEOUT['n']

regress(y, x)

plt.title('Lake of the Woods Ice Out')
plt.ylabel('Day of Year')
plt.xlabel('Year')
plt.axis(rl.axis())

fname = img + 'IceOut_LOW.png'
plt.savefig(fname)
get_ipython().system('convert $fname -trim $fname')


# In[ ]:
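# As an illustrative aside that is not part of the original analysis, the three day-of-year series can be overlaid on one set of axes for a quick visual comparison. The sketch below assumes the RL_ICEOUT, KL_ICEOUT, and LOW_ICEOUT DataFrames constructed above are still in memory.

# In[ ]:


# Illustrative sketch: overlay the three ice-out series on a single set of axes
fig, ax = plt.subplots()
for label, df in [('Rainy Lake', RL_ICEOUT),
                  ('Kabetogama Lake', KL_ICEOUT),
                  ('Lake of the Woods', LOW_ICEOUT)]:
    ax.plot(df.index.year, df['DOY'], marker='.', label=label)
ax.set_xlabel('Year')
ax.set_ylabel('Day of Year')
ax.set_title('Ice Out Day of Year by Lake')
ax.legend()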