#!/usr/bin/env python
# coding: utf-8

# Notebook export: fit Gaussian-process regressors to 3-hourly Citi Bike trip
# counts for a single start station and plot the predictions against the
# observed counts.

# In[1]:

# NOTE(review): `pd`, `np` and `plt` are never imported explicitly in this
# file; presumably the star import from `preamble` provides them -- confirm.
from preamble import *

# The export originally emitted run_line_magic('', 'matplotlib notebook'):
# an empty magic name (artifact of "% matplotlib notebook"), which IPython
# cannot resolve. The magic name and its argument are now passed separately.
get_ipython().run_line_magic('matplotlib', 'notebook')

# Analysis parameters, named once instead of being repeated as literals.
RESAMPLE_RULE = "3h"   # width of one time bin
BINS_PER_DAY = 8       # 24 h / 3 h
BINS_PER_WEEK = 56     # 7 * BINS_PER_DAY
STATION_ID = 301       # "start station id" value that is modelled
N_TRAIN = 1500         # number of leading time bins used for fitting


# # Gaussian Processes
#
# Check out http://scikit-learn.org/dev/modules/gaussian_process.html

# ### download data from https://s3.amazonaws.com/tripdata/201307-201402-citibike-tripdata.zip

# In[2]:

from glob import glob

# Sorted so the load order does not depend on filesystem enumeration order.
csv_paths = sorted(glob("data/citibike/*.csv"))
if not csv_paths:
    # pd.concat([]) would otherwise fail later with a much less helpful message.
    raise FileNotFoundError(
        "No CSV files found in data/citibike/. Download and unzip "
        "https://s3.amazonaws.com/tripdata/201307-201402-citibike-tripdata.zip "
        "into that directory first."
    )
dfs = [pd.read_csv(path) for path in csv_paths]


# In[3]:

data = pd.concat(dfs)


# In[4]:

data.columns


# In[5]:

data.head()


# In[6]:

# Constant column: summing it per group/bin yields the number of trips.
data['one'] = 1
data['starttime'] = pd.to_datetime(data.starttime)
data = data.set_index("starttime")


# In[7]:

# Trip count per start station and per time bin.
data_resampled = data.groupby("start station id").one.resample(RESAMPLE_RULE).sum()


# In[8]:

# One column per station; station/bin combinations without data become 0.
per_station = data_resampled.unstack(level=0).fillna(0)


# In[9]:

plt.figure()
per_station[STATION_ID].plot()


# In[10]:

from sklearn.gaussian_process import GaussianProcessRegressor


# In[11]:

# Target: trip counts of the chosen station. Feature: the bin index 0..n-1
# as a single column.
y = per_station[STATION_ID].values
X = np.arange(len(y)).reshape(-1, 1)


# In[12]:

# Baseline: default kernel, fitted on the whole series.
gp = GaussianProcessRegressor().fit(X, y)


# In[13]:

plt.figure()
plt.plot(y, label="y")
plt.plot(gp.predict(X), label="preds")
plt.legend()


# In[14]:

gp.kernel_


# In[18]:

from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, WhiteKernel

# Sum of two RBF terms plus two RBF-damped periodic terms whose periods are
# fixed to one day and one week of bins.
# NOTE(review): the first two RBF terms do not set length_scale, and the
# scikit-learn default (1.0) lies outside both bound ranges given here --
# confirm that this starting point is intended.
kernel = (
    1.0 * RBF(length_scale_bounds=(2, 500))
    + 1.0 * RBF(length_scale_bounds=(50, 1000))
    + 1.0 * RBF(length_scale=100, length_scale_bounds=(2, 500))
    * ExpSineSquared(periodicity=BINS_PER_DAY, periodicity_bounds="fixed")
    # + 1.0 * WhiteKernel(noise_level=1)
    + 1.0 * RBF(length_scale=100, length_scale_bounds=(2, 500))
    * ExpSineSquared(periodicity=BINS_PER_WEEK, periodicity_bounds="fixed")
)

# Fit on the first N_TRAIN bins only; the later bins stay unseen by the model.
gp = GaussianProcessRegressor(alpha=1, normalize_y=True, kernel=kernel).fit(
    X[:N_TRAIN], y[:N_TRAIN]
)


# In[20]:

# Predictions over the full range, i.e. including bins beyond the training slice.
plt.figure()
plt.plot(y, label="y")
plt.plot(gp.predict(X), label="preds")
plt.legend()


# In[21]:

gp.kernel_


# # Exercise
# Pick a subset of stations from a particular area of the city. Can you use
# location information to improve the estimates? Can you make predictions for
# a station given data from other stations?