#!/usr/bin/env python
# coding: utf-8

# In[1]:


from preamble import *
# Select the interactive "notebook" matplotlib backend. run_line_magic takes
# the magic name and its argument line as two separate parameters; passing an
# empty name with the whole command as the argument cannot be resolved.
get_ipython().run_line_magic('matplotlib', 'notebook')


# # Gaussian Processes
# # Check out http://scikit-learn.org/dev/modules/gaussian_process.html

# ### Download the data from https://s3.amazonaws.com/tripdata/201307-201402-citibike-tripdata.zip and extract the CSV files into data/citibike/

# In[2]:


from glob import glob
# Read every trip CSV. The paths are sorted because glob() returns them in
# arbitrary order and the concatenated row order should be reproducible.
csv_paths = sorted(glob("data/citibike/*.csv"))
if not csv_paths:
    # Fail here with an actionable message instead of letting pd.concat
    # complain about an empty list further down.
    raise FileNotFoundError(
        "no CSV files found in data/citibike/ - download and extract the "
        "citibike trip data first"
    )
dfs = [pd.read_csv(path) for path in csv_paths]


# In[3]:


# Stack the per-file frames into a single table of trips.
data = pd.concat(dfs)


# In[4]:


# Column labels of the combined trip table. As a bare expression this is only
# shown as cell output in a notebook; run as a script it displays nothing.
data.columns


# In[5]:


# First rows of the combined trip table (likewise notebook cell output only).
data.head()


# In[6]:


# Index the trips by their parsed start time so they can be resampled, and add
# a constant column of ones whose sum over a time bin is the trip count.
data["starttime"] = pd.to_datetime(data["starttime"])
data["one"] = 1
data = data.set_index("starttime")


# In[7]:


# For each start station, count the trips that begin in every 3-hour bin.
trips_by_station = data.groupby("start station id")["one"]
data_resampled = trips_by_station.resample("3h").sum()


# In[8]:


# Pivot the station level into columns (one column per station, one row per
# time bin) and treat bins that are missing for a station as zero trips.
per_station = data_resampled.unstack(level=0)
per_station = per_station.fillna(0)


# In[9]:


# Draw the 3-hourly trip counts of station 301 on a fresh figure.
station_301 = per_station[301]
plt.figure()
station_301.plot()


# In[10]:


from sklearn.gaussian_process import GaussianProcessRegressor


# In[11]:


# Target: the trip counts of station 301 as a plain array.
y = per_station[301].values
# Feature: the position of each sample in the series, laid out as a
# single-column 2-D array, which is the input shape the regressor is fit on.
X = np.arange(y.shape[0])[:, np.newaxis]


# In[12]:


# Baseline: a Gaussian process regressor with all-default settings, fitted to
# the complete series.
gp = GaussianProcessRegressor()
gp = gp.fit(X, y)


# In[13]:


# Overlay the baseline model's predictions on the observed counts.
preds = gp.predict(X)
plt.figure()
plt.plot(y, label="y")
plt.plot(preds, label="preds")
plt.legend()


# In[14]:


# Kernel of the fitted baseline model, shown as notebook cell output; as a
# bare expression it displays nothing when run as a script.
gp.kernel_


# In[18]:


from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, WhiteKernel
# The series is binned in 3-hour steps, so a periodicity of 8 steps is one day
# and 56 steps is one week. The kernel is the sum of two RBF terms and two
# periodic terms, each periodic term being multiplied by its own RBF.
# NOTE(review): the first two RBF terms keep the default length_scale, which
# appears to lie below their lower bounds - confirm the intended start value.
short_trend = 1.0 * RBF(length_scale_bounds=(2, 500))
long_trend = 1.0 * RBF(length_scale_bounds=(50, 1000))
daily_cycle = (1.0 * RBF(length_scale=100, length_scale_bounds=(2, 500))
               * ExpSineSquared(periodicity=8, periodicity_bounds="fixed"))
weekly_cycle = (1.0 * RBF(length_scale=100, length_scale_bounds=(2, 500))
                * ExpSineSquared(periodicity=56, periodicity_bounds="fixed"))
# Alternative noise term that is currently left out of the sum:
#   + 1.0 * WhiteKernel(noise_level=1)
kernel = short_trend + long_trend + daily_cycle + weekly_cycle

# Fit on the first 1500 samples only; the remaining samples are not seen
# during training.
gp = GaussianProcessRegressor(kernel=kernel, alpha=1, normalize_y=True)
gp = gp.fit(X[:1500], y[:1500])


# In[20]:


# Compare the refitted model against the observed counts over the whole
# series, including the samples beyond the training slice.
predicted = gp.predict(X)
plt.figure()
plt.plot(y, label="y")
plt.plot(predicted, label="preds")
plt.legend()


# In[21]:


# Kernel of the refitted model, shown as notebook cell output; as a bare
# expression it displays nothing when run as a script.
gp.kernel_


# # Exercise
# Pick a subset of stations from a particular area of the city. Can you use location information to improve the estimates? Can you make predictions for one station given data from other stations?