In [1]:

from preamble import *
% matplotlib notebook

Gaussian Processes¶

Check out http://scikit-learn.org/dev/modules/gaussian_process.html ¶

download data from https://s3.amazonaws.com/tripdata/201307-201402-citibike-tripdata.zip ¶

In [2]:

from glob import glob
dfs = []
for file in glob("data/citibike/*.csv"):
    dfs.append(pd.read_csv(file))

In [3]:

data = pd.concat(dfs)

In [4]:

data.columns

Out[4]:

Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender'],
      dtype='object')

In [5]:

data.head()

Out[5]:

	tripduration	starttime	stoptime	start station id	start station name	start station latitude	start station longitude	end station id	end station name	end station latitude	end station longitude	bikeid	usertype	birth year	gender
0	326	2013-10-01 00:01:08	2013-10-01 00:06:34	239	Willoughby St & Fleet St	40.691966	-73.981302	366	Clinton Ave & Myrtle Ave	40.693261	-73.968896	16052	Subscriber	1982	1
1	729	2013-10-01 00:01:21	2013-10-01 00:13:30	322	Clinton St & Tillary St	40.696192	-73.991218	398	Atlantic Ave & Furman St	40.691652	-73.999979	19412	Customer	\N	0
2	520	2013-10-01 00:01:24	2013-10-01 00:10:04	174	E 25 St & 1 Ave	40.738177	-73.977387	403	E 2 St & 2 Ave	40.725029	-73.990697	19645	Subscriber	1984	1
3	281	2013-10-01 00:01:25	2013-10-01 00:06:06	430	York St & Jay St	40.701485	-73.986569	323	Lawrence St & Willoughby St	40.692362	-73.986317	16992	Subscriber	1985	1
4	196	2013-10-01 00:01:27	2013-10-01 00:04:43	403	E 2 St & 2 Ave	40.725029	-73.990697	401	Allen St & Rivington St	40.720196	-73.989978	15690	Subscriber	1986	1

In [6]:

data['one'] = 1
data['starttime'] = pd.to_datetime(data.starttime)
data = data.set_index("starttime")

In [7]:

data_resampled = data.groupby("start station id").one.resample("3h").sum()

In [8]:

per_station = data_resampled.unstack(level=0).fillna(0)

In [9]:

plt.figure()
per_station[301].plot()

Out[9]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f3fec194128>

In [10]:

from sklearn.gaussian_process import GaussianProcessRegressor

In [11]:

y = per_station[301].values
X = np.arange(len(y)).reshape(-1, 1)

In [12]:

gp = GaussianProcessRegressor().fit(X, y)

In [13]:

plt.figure()
plt.plot(y, label="y")
plt.plot(gp.predict(X), label="preds")
plt.legend()

Out[13]:

<matplotlib.legend.Legend at 0x7f3fec060400>

In [14]:

gp.kernel_

Out[14]:

1**2 * RBF(length_scale=1)

In [18]:

from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, WhiteKernel
gp = GaussianProcessRegressor(alpha=1, normalize_y=True,
                              kernel = 1.0 * RBF(length_scale_bounds=(2, 500)) + 1.0 * RBF(length_scale_bounds=(50, 1000))
                              + 1.0 * RBF(length_scale=100, length_scale_bounds=(2, 500)) * ExpSineSquared(periodicity=8, periodicity_bounds="fixed")  # + 1.0 * WhiteKernel(noise_level=1)
                              + 1.0 * RBF(length_scale=100, length_scale_bounds=(2, 500)) * ExpSineSquared(periodicity=56, periodicity_bounds="fixed")).fit(X[:1500], y[:1500])

In [20]:

plt.figure()
plt.plot(y, label="y")
plt.plot(gp.predict(X), label="preds")
plt.legend()

Out[20]:

<matplotlib.legend.Legend at 0x7f3feb6c5d68>

In [21]:

gp.kernel_

Out[21]:

1.18**2 * RBF(length_scale=8.34) + 6.15**2 * RBF(length_scale=627) + 5.58**2 * RBF(length_scale=500) * ExpSineSquared(length_scale=0.000298, periodicity=8) + 6.96**2 * RBF(length_scale=37.7) * ExpSineSquared(length_scale=0.0621, periodicity=56)

Exercise¶

Pick a subset of stations from a particular area of the city. Can you use location information to improve the estimates? Can you make predictions for a station given on other stations?

Gaussian Processes¶

Check out http://scikit-learn.org/dev/modules/gaussian_process.html¶

download data from https://s3.amazonaws.com/tripdata/201307-201402-citibike-tripdata.zip¶

Exercise¶

Check out http://scikit-learn.org/dev/modules/gaussian_process.html ¶

download data from https://s3.amazonaws.com/tripdata/201307-201402-citibike-tripdata.zip ¶