%matplotlib inline
%config InlineBackend.figure_format='retina'
import dask.dataframe as dd
import dask.distributed
import numpy as np
import pandas as pd
# import geopandas as gpd
from matplotlib.colors import SymLogNorm as symlog
from matplotlib import rcParams
import sklearn, sklearn.cluster
import matplotlib.pyplot as plt
import palettable
import seaborn as sns
import netCDF4
import geopandas
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100
client = dask.distributed.Client()
tzdf = geopandas.read_file('../shapefiles/taxi_zones.shp')
rcParams['font.sans-serif'] = ('Helvetica', 'Arial', 'Open Sans', 'Bitstream Vera Sans')
rcParams['font.size'] = 12
rcParams['font.stretch'] = 'normal'
rcParams['font.weight'] = 'normal'
rcParams['savefig.dpi'] = 150
rcParams['figure.dpi'] = 150
import seaborn as sns
import os.path
homedirpath = os.path.expanduser('~')
fontdirpath = ''
if '/Users/' in homedirpath:
fontdirpath = os.path.join(homedirpath, 'Library/Fonts/')
else:
fontdirpath = os.path.join(homedirpath, '.fonts/')
fontsize2 = 'size={0:0.1f}'.format(12)
rcParams['mathtext.it'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file={0}/'
'HelveticaOblique.ttf:' +
fontsize2
).format(fontdirpath))
rcParams['mathtext.rm'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file={0}/'
'Helvetica.ttf:' +
fontsize2
).format(fontdirpath))
rcParams['mathtext.tt'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file={0}/'
'Helvetica.ttf:' +
fontsize2
).format(fontdirpath))
rcParams['mathtext.bf'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file={0}/'
'HelveticaBold.ttf:' +
fontsize2
).format(fontdirpath))
rcParams['mathtext.cal'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file='
'{0}/Helvetica.ttf:' +
fontsize2
).format(fontdirpath))
rcParams['mathtext.sf'] = ((':family=sans-serif:style=normal:variant='
'normal:weight=normal:stretch=normal:file={0}/'
'Helvetica.ttf:' +
fontsize2
).format(fontdirpath))
df = dd.read_parquet('/data/all_trips.parquet', index='trip_id',
columns='pickup_datetime dropoff_datetime pickup_taxizone_id dropoff_taxizone_id'.split())
/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6 warnings.warn('Regression warning: found category spec from '
df2 = df.sample(frac=1.0e-6, random_state=42).compute()
df2 = df2.dropna()
df3 = df2.merge(
tzdf['LocationID borough zone'.split()], left_on='pickup_taxizone_id', right_on='LocationID'
)
df3['pickup_location'] = df3.borough.map(str) + " | " + df3.zone
df3 = df3.drop('LocationID borough zone'.split(), axis=1)
df3 = df3.merge(
tzdf['LocationID borough zone'.split()], left_on='dropoff_taxizone_id', right_on='LocationID'
)
df3['dropoff_location'] = df3.borough.map(str) + " | " + df3.zone
df3 = df3.drop('LocationID borough zone'.split(), axis=1)
df3 = df3.sample(frac=1, replace=False, random_state=42).reset_index(drop=True)
df3.head(10).sort_values('pickup_datetime').reset_index(drop=True).to_html().replace("""\n""", "")
'<table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>pickup_datetime</th> <th>dropoff_datetime</th> <th>pickup_taxizone_id</th> <th>dropoff_taxizone_id</th> <th>pickup_location</th> <th>dropoff_location</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>2009-04-02 15:02:43</td> <td>2009-04-02 15:22:45</td> <td>138</td> <td>229</td> <td>Queens | LaGuardia Airport</td> <td>Manhattan | Sutton Place/Turtle Bay North</td> </tr> <tr> <th>1</th> <td>2009-05-03 01:34:00</td> <td>2009-05-03 01:36:00</td> <td>79</td> <td>164</td> <td>Manhattan | East Village</td> <td>Manhattan | Midtown South</td> </tr> <tr> <th>2</th> <td>2010-10-09 00:48:38</td> <td>2010-10-09 01:00:55</td> <td>79</td> <td>237</td> <td>Manhattan | East Village</td> <td>Manhattan | Upper East Side South</td> </tr> <tr> <th>3</th> <td>2011-09-07 10:03:00</td> <td>2011-09-07 10:09:00</td> <td>113</td> <td>211</td> <td>Manhattan | Greenwich Village North</td> <td>Manhattan | SoHo</td> </tr> <tr> <th>4</th> <td>2012-07-21 20:34:00</td> <td>2012-07-21 20:39:00</td> <td>211</td> <td>114</td> <td>Manhattan | SoHo</td> <td>Manhattan | Greenwich Village South</td> </tr> <tr> <th>5</th> <td>2013-05-31 17:45:06</td> <td>2013-05-31 18:41:07</td> <td>138</td> <td>25</td> <td>Queens | LaGuardia Airport</td> <td>Brooklyn | Boerum Hill</td> </tr> <tr> <th>6</th> <td>2014-08-08 09:06:15</td> <td>2014-08-08 09:14:52</td> <td>230</td> <td>100</td> <td>Manhattan | Times Sq/Theatre District</td> <td>Manhattan | Garment District</td> </tr> <tr> <th>7</th> <td>2015-03-28 05:51:31</td> <td>2015-03-28 06:00:05</td> <td>263</td> <td>107</td> <td>Manhattan | Yorkville West</td> <td>Manhattan | Gramercy</td> </tr> <tr> <th>8</th> <td>2015-10-11 10:20:04</td> <td>2015-10-11 10:31:45</td> <td>75</td> <td>194</td> <td>Manhattan | East Harlem South</td> <td>Manhattan | Randalls Island</td> </tr> <tr> <th>9</th> <td>2016-07-15 00:57:44</td> <td>2016-07-15 01:03:16</td> <td>125</td> <td>186</td> <td>Manhattan | Hudson Sq</td> <td>Manhattan | Penn Station/Madison Sq West</td> </tr> </tbody></table>'
from IPython.display import HTML
HTML(df3.head(10).sort_values('pickup_datetime').reset_index(drop=True).to_html())
pickup_datetime | dropoff_datetime | pickup_taxizone_id | dropoff_taxizone_id | pickup_location | dropoff_location | |
---|---|---|---|---|---|---|
0 | 2009-04-02 15:02:43 | 2009-04-02 15:22:45 | 138 | 229 | Queens | LaGuardia Airport | Manhattan | Sutton Place/Turtle Bay North |
1 | 2009-05-03 01:34:00 | 2009-05-03 01:36:00 | 79 | 164 | Manhattan | East Village | Manhattan | Midtown South |
2 | 2010-10-09 00:48:38 | 2010-10-09 01:00:55 | 79 | 237 | Manhattan | East Village | Manhattan | Upper East Side South |
3 | 2011-09-07 10:03:00 | 2011-09-07 10:09:00 | 113 | 211 | Manhattan | Greenwich Village North | Manhattan | SoHo |
4 | 2012-07-21 20:34:00 | 2012-07-21 20:39:00 | 211 | 114 | Manhattan | SoHo | Manhattan | Greenwich Village South |
5 | 2013-05-31 17:45:06 | 2013-05-31 18:41:07 | 138 | 25 | Queens | LaGuardia Airport | Brooklyn | Boerum Hill |
6 | 2014-08-08 09:06:15 | 2014-08-08 09:14:52 | 230 | 100 | Manhattan | Times Sq/Theatre District | Manhattan | Garment District |
7 | 2015-03-28 05:51:31 | 2015-03-28 06:00:05 | 263 | 107 | Manhattan | Yorkville West | Manhattan | Gramercy |
8 | 2015-10-11 10:20:04 | 2015-10-11 10:31:45 | 75 | 194 | Manhattan | East Harlem South | Manhattan | Randalls Island |
9 | 2016-07-15 00:57:44 | 2016-07-15 01:03:16 | 125 | 186 | Manhattan | Hudson Sq | Manhattan | Penn Station/Madison Sq West |
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
columns=['pickup_taxizone_id', 'dropoff_taxizone_id'])
df['pickup_taxizone_id'] = df.pickup_taxizone_id.fillna(266.).astype(np.int32)
df['dropoff_taxizone_id'] = df.dropoff_taxizone_id.fillna(266.).astype(np.int32)
/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6 warnings.warn('Regression warning: found category spec from '
df.head()
pickup_taxizone_id | dropoff_taxizone_id | |
---|---|---|
pickup_datetime | ||
2009-01-01 00:00:00 | 237 | 263 |
2009-01-01 00:00:00 | 114 | 249 |
2009-01-01 00:00:02 | 237 | 43 |
2009-01-01 00:00:04 | 261 | 261 |
2009-01-01 00:00:07 | 144 | 80 |
count_dataframe = df.reset_index().groupby(['pickup_taxizone_id', 'dropoff_taxizone_id']).count().compute()
count_dataframe.columns = ['count']
count_dataframe.shape
(60710, 1)
count_dataframe.head()
count | ||
---|---|---|
pickup_taxizone_id | dropoff_taxizone_id | |
1 | 1 | 101493 |
186 | 35 | |
266 | 37311 | |
2 | 2 | 4297 |
14 | 20 |
count_matrix = np.zeros((267, 267), dtype=np.int64)
for r in count_dataframe.reset_index().itertuples():
count_matrix[r[1], r[2]] = r[3]
count_dataframe.describe()
count | |
---|---|
count | 6.071000e+04 |
mean | 2.277015e+04 |
std | 1.698617e+05 |
min | 1.000000e+00 |
25% | 9.000000e+00 |
50% | 8.800000e+01 |
75% | 1.258000e+03 |
max | 2.556999e+07 |
count_dataframe.reset_index().head()
pickup_taxizone_id | dropoff_taxizone_id | count | |
---|---|---|---|
0 | 1 | 1 | 101493 |
1 | 1 | 186 | 35 |
2 | 1 | 266 | 37311 |
3 | 2 | 2 | 4297 |
4 | 2 | 14 | 20 |
# <!-- collapse=True -->
plt.imshow(count_matrix[1:-3, 1:-3].T, norm=symlog(10000), origin='upper', cmap=plt.cm.Blues)
plt.grid(False)
plt.xlabel("Dropoff Taxi Zone ID")
plt.ylabel("Pickup Taxi Zone ID")
plt.gcf().set_size_inches(4, 4)
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
columns=['pickup_taxizone_id', 'trip_type'])
df = df[df.trip_type != 'uber']
df = df.drop('trip_type', axis=1)
df['pickup_taxizone_id'] = df.pickup_taxizone_id.fillna(266.).astype(np.int32)
/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6 warnings.warn('Regression warning: found category spec from '
def get_year_mo_day(data, col):
# d = np.core.defchararray.replace(np.core.defchararray.add(data.index.values.astype('M8[h]').astype(np.str), ":00"), 'T', ' ')
# return d
return data.index.values.astype('M8[h]')
df['pickup_ymd'] = df.map_partitions(get_year_mo_day, 'pickup_datetime', meta=('asdf', np.datetime64))
/home/shekhar/anaconda3/lib/python3.6/site-packages/dask/dataframe/utils.py:232: FutureWarning: Passing in 'datetime64' dtype with no frequency is deprecated and will raise in a future version. Please pass in 'datetime64[ns]' instead. return pd.Series([], dtype=dtype, name=name, index=index)
df.reset_index().rename(columns=dict(index='N')).tail()
N | pickup_taxizone_id | pickup_ymd | |
---|---|---|---|
1432504 | 2016-12-31 23:59:57 | 36 | 2016-12-31 23:00:00 |
1432505 | 2016-12-31 23:59:58 | 76 | 2016-12-31 23:00:00 |
1432506 | 2016-12-31 23:59:58 | 168 | 2016-12-31 23:00:00 |
1432507 | 2016-12-31 23:59:58 | 144 | 2016-12-31 23:00:00 |
1432508 | 2016-12-31 23:59:59 | 135 | 2016-12-31 23:00:00 |
pickup_counts_df = df.reset_index().rename(columns=dict(index='N')).groupby(['pickup_taxizone_id', 'pickup_ymd',]).count().compute()
pickup_counts_df.sort_index(inplace=True)
pickup_counts_df.head()
N | ||
---|---|---|
pickup_taxizone_id | pickup_ymd | |
1 | 2009-01-01 01:00:00 | 1 |
2009-01-01 02:00:00 | 1 | |
2009-01-01 04:00:00 | 2 | |
2009-01-01 05:00:00 | 1 | |
2009-01-01 07:00:00 | 1 |
z = pickup_counts_df.unstack(0)
z.columns = np.arange(1, 267).astype(str)
z = z.merge(
pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2016-12-31 23:00:00', freq='H')),
how='right', left_index=True, right_index=True).fillna(0).astype(np.int32)
z.head()
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | ... | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2009-01-01 00:00:00 | 0 | 0 | 0 | 121 | 0 | 0 | 42 | 0 | 0 | 0 | 0 | 4 | 63 | 2 | 0 | 0 | 13 | 1 | 1 | 0 | 0 | 1 | 3 | 78 | 28 | 0 | 0 | 3 | 0 | 0 | 0 | 1 | 29 | 1 | 0 | 3 | 16 | 0 | 0 | 27 | 53 | 30 | 207 | 0 | 21 | 0 | 0 | 594 | 18 | 178 | ... | 2 | 0 | 0 | 0 | 0 | 0 | 18 | 71 | 5 | 35 | 1 | 1 | 379 | 57 | 328 | 70 | 211 | 544 | 1 | 384 | 459 | 407 | 439 | 0 | 3 | 0 | 10 | 32 | 0 | 211 | 1 | 2 | 409 | 0 | 0 | 0 | 0 | 0 | 74 | 53 | 4 | 2 | 0 | 6 | 70 | 179 | 449 | 0 | 0 | 369 |
2009-01-01 01:00:00 | 1 | 0 | 0 | 137 | 0 | 0 | 88 | 0 | 0 | 0 | 0 | 4 | 63 | 2 | 0 | 1 | 11 | 1 | 0 | 0 | 0 | 0 | 5 | 84 | 51 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 78 | 2 | 0 | 8 | 22 | 0 | 0 | 37 | 79 | 41 | 103 | 0 | 39 | 0 | 0 | 440 | 37 | 184 | ... | 7 | 0 | 0 | 2 | 0 | 0 | 35 | 121 | 10 | 68 | 0 | 8 | 540 | 130 | 360 | 131 | 298 | 402 | 1 | 382 | 459 | 486 | 549 | 0 | 1 | 4 | 24 | 64 | 0 | 167 | 3 | 0 | 322 | 1 | 0 | 0 | 0 | 0 | 97 | 92 | 7 | 0 | 0 | 22 | 85 | 239 | 627 | 0 | 0 | 452 |
2009-01-01 02:00:00 | 1 | 0 | 0 | 125 | 0 | 0 | 115 | 0 | 0 | 0 | 0 | 2 | 58 | 4 | 0 | 2 | 19 | 1 | 0 | 0 | 0 | 0 | 4 | 90 | 54 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 54 | 0 | 0 | 4 | 15 | 0 | 0 | 54 | 99 | 50 | 121 | 0 | 47 | 0 | 0 | 486 | 50 | 130 | ... | 11 | 0 | 1 | 3 | 0 | 0 | 40 | 119 | 6 | 69 | 1 | 10 | 496 | 155 | 348 | 115 | 288 | 295 | 1 | 329 | 267 | 401 | 491 | 0 | 1 | 0 | 28 | 71 | 0 | 117 | 4 | 0 | 335 | 0 | 0 | 1 | 0 | 0 | 122 | 115 | 6 | 1 | 0 | 21 | 55 | 170 | 594 | 0 | 0 | 378 |
2009-01-01 03:00:00 | 0 | 0 | 0 | 124 | 0 | 0 | 110 | 1 | 1 | 0 | 0 | 1 | 33 | 8 | 0 | 3 | 18 | 0 | 0 | 0 | 0 | 2 | 3 | 51 | 61 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 55 | 1 | 1 | 8 | 22 | 1 | 0 | 35 | 67 | 44 | 67 | 0 | 40 | 0 | 0 | 541 | 42 | 129 | ... | 9 | 0 | 0 | 5 | 0 | 0 | 34 | 74 | 4 | 53 | 2 | 9 | 349 | 303 | 322 | 80 | 144 | 348 | 1 | 220 | 169 | 199 | 339 | 0 | 0 | 1 | 15 | 61 | 0 | 115 | 2 | 0 | 261 | 0 | 1 | 0 | 0 | 0 | 80 | 98 | 5 | 1 | 0 | 28 | 70 | 96 | 437 | 0 | 0 | 313 |
2009-01-01 04:00:00 | 2 | 0 | 0 | 100 | 0 | 0 | 105 | 0 | 1 | 0 | 0 | 0 | 19 | 7 | 0 | 4 | 8 | 2 | 3 | 1 | 0 | 1 | 0 | 41 | 43 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 27 | 3 | 0 | 7 | 24 | 0 | 0 | 20 | 54 | 28 | 34 | 0 | 38 | 0 | 1 | 390 | 27 | 120 | ... | 3 | 0 | 0 | 5 | 0 | 0 | 29 | 34 | 3 | 59 | 1 | 7 | 233 | 308 | 221 | 53 | 116 | 286 | 3 | 91 | 79 | 112 | 186 | 0 | 0 | 0 | 10 | 27 | 0 | 69 | 6 | 1 | 241 | 0 | 0 | 0 | 0 | 0 | 80 | 74 | 5 | 0 | 0 | 20 | 37 | 62 | 296 | 0 | 0 | 224 |
5 rows × 266 columns
import fastparquet
fastparquet.write('/data/trips_pickups_matrix.parquet', z, compression='SNAPPY')
df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
columns=['dropoff_datetime', 'dropoff_taxizone_id', 'trip_type'])
df = df[df.trip_type != 'uber']
df = df.drop('trip_type', axis=1)
df['dropoff_taxizone_id'] = df.dropoff_taxizone_id.fillna(266.).astype(np.int32)
/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6 warnings.warn('Regression warning: found category spec from '
def get_year_mo_day(data, col):
# d = np.core.defchararray.replace(np.core.defchararray.add(data.index.values.astype('M8[h]').astype(np.str), ":00"), 'T', ' ')
# return d
return data.index.values.astype('M8[h]')
df['dropoff_ymd'] = df.map_partitions(get_year_mo_day, 'dropoff_datetime', meta=('asdf', np.datetime64))
/home/shekhar/anaconda3/lib/python3.6/site-packages/dask/dataframe/utils.py:232: FutureWarning: Passing in 'datetime64' dtype with no frequency is deprecated and will raise in a future version. Please pass in 'datetime64[ns]' instead. return pd.Series([], dtype=dtype, name=name, index=index)
df.reset_index(drop=True).tail()
dropoff_datetime | dropoff_taxizone_id | dropoff_ymd | |
---|---|---|---|
1432504 | 2017-01-01 00:07:47 | 36 | 2016-12-31 23:00:00 |
1432505 | 2017-01-01 00:15:29 | 63 | 2016-12-31 23:00:00 |
1432506 | 2017-01-01 00:39:07 | 161 | 2016-12-31 23:00:00 |
1432507 | 2017-01-01 00:03:50 | 209 | 2016-12-31 23:00:00 |
1432508 | 2017-01-01 00:14:30 | 134 | 2016-12-31 23:00:00 |
dropoff_counts_df = df.reset_index(drop=True).rename(columns=dict(dropoff_datetime='N')).groupby(['dropoff_taxizone_id', 'dropoff_ymd',]).count().compute()
dropoff_counts_df.sort_index(inplace=True)
dropoff_counts_df.head()
N | ||
---|---|---|
dropoff_taxizone_id | dropoff_ymd | |
1 | 2009-01-01 01:00:00 | 2 |
2009-01-01 02:00:00 | 3 | |
2009-01-01 03:00:00 | 1 | |
2009-01-01 04:00:00 | 10 | |
2009-01-01 05:00:00 | 10 |
z2 = dropoff_counts_df.unstack(0)
z2 = z2.merge(
pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2016-12-31 23:00:00', freq='H')),
how='right', left_index=True, right_index=True).fillna(0).astype(np.int32)
/home/shekhar/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/merge.py:551: UserWarning: merging between different levels can give an unintended result (2 levels on the left, 1 on the right) warnings.warn(msg, UserWarning)
z2.columns = np.arange(1, 267).astype(str)
z2.head()
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | ... | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2009-01-01 00:00:00 | 0 | 0 | 1 | 128 | 0 | 1 | 113 | 0 | 0 | 1 | 1 | 1 | 111 | 20 | 1 | 8 | 28 | 3 | 1 | 6 | 2 | 1 | 4 | 84 | 36 | 5 | 1 | 6 | 2 | 0 | 0 | 0 | 57 | 4 | 1 | 10 | 23 | 1 | 0 | 32 | 95 | 81 | 95 | 0 | 30 | 0 | 0 | 434 | 44 | 150 | ... | 7 | 3 | 1 | 9 | 0 | 0 | 53 | 111 | 16 | 56 | 9 | 17 | 352 | 50 | 281 | 75 | 291 | 366 | 3 | 432 | 253 | 407 | 404 | 1 | 10 | 2 | 43 | 82 | 0 | 188 | 0 | 3 | 306 | 3 | 0 | 2 | 0 | 0 | 85 | 75 | 13 | 6 | 3 | 36 | 111 | 282 | 450 | 0 | 0 | 379 |
2009-01-01 01:00:00 | 2 | 0 | 2 | 154 | 0 | 1 | 188 | 2 | 1 | 0 | 3 | 1 | 87 | 25 | 4 | 5 | 28 | 4 | 1 | 1 | 2 | 4 | 5 | 96 | 47 | 4 | 1 | 3 | 0 | 0 | 1 | 1 | 104 | 5 | 3 | 8 | 25 | 0 | 0 | 37 | 140 | 91 | 97 | 0 | 39 | 1 | 2 | 402 | 49 | 147 | ... | 10 | 1 | 2 | 24 | 0 | 0 | 89 | 161 | 22 | 94 | 9 | 23 | 372 | 118 | 226 | 96 | 335 | 331 | 8 | 395 | 227 | 434 | 413 | 0 | 9 | 5 | 86 | 106 | 1 | 157 | 2 | 2 | 340 | 2 | 0 | 5 | 0 | 1 | 103 | 84 | 22 | 7 | 7 | 41 | 75 | 346 | 627 | 0 | 0 | 469 |
2009-01-01 02:00:00 | 3 | 0 | 1 | 134 | 1 | 0 | 245 | 0 | 4 | 4 | 3 | 3 | 88 | 27 | 3 | 10 | 41 | 7 | 1 | 7 | 3 | 7 | 4 | 89 | 45 | 5 | 0 | 7 | 2 | 0 | 1 | 3 | 85 | 1 | 3 | 15 | 30 | 2 | 0 | 31 | 150 | 103 | 77 | 0 | 47 | 0 | 1 | 414 | 65 | 121 | ... | 10 | 0 | 1 | 15 | 1 | 0 | 105 | 145 | 26 | 103 | 11 | 24 | 285 | 162 | 210 | 84 | 301 | 243 | 2 | 390 | 160 | 358 | 370 | 0 | 4 | 2 | 71 | 118 | 1 | 115 | 12 | 2 | 283 | 1 | 0 | 3 | 1 | 0 | 92 | 98 | 28 | 3 | 4 | 53 | 66 | 283 | 536 | 0 | 0 | 408 |
2009-01-01 03:00:00 | 1 | 0 | 1 | 109 | 0 | 0 | 229 | 0 | 4 | 1 | 3 | 1 | 69 | 40 | 2 | 7 | 31 | 7 | 3 | 9 | 5 | 13 | 6 | 86 | 55 | 3 | 0 | 9 | 2 | 1 | 1 | 1 | 69 | 2 | 1 | 21 | 46 | 0 | 0 | 33 | 113 | 100 | 50 | 0 | 33 | 0 | 3 | 368 | 69 | 143 | ... | 9 | 0 | 1 | 19 | 0 | 1 | 107 | 113 | 16 | 84 | 13 | 33 | 245 | 197 | 171 | 85 | 215 | 204 | 3 | 267 | 136 | 245 | 251 | 0 | 6 | 1 | 71 | 105 | 0 | 90 | 5 | 2 | 241 | 0 | 1 | 5 | 0 | 2 | 59 | 81 | 24 | 8 | 5 | 56 | 89 | 233 | 448 | 0 | 0 | 334 |
2009-01-01 04:00:00 | 10 | 0 | 2 | 103 | 0 | 0 | 221 | 1 | 3 | 2 | 2 | 1 | 50 | 38 | 1 | 5 | 32 | 4 | 2 | 2 | 4 | 6 | 1 | 56 | 46 | 2 | 0 | 9 | 1 | 0 | 2 | 2 | 46 | 2 | 3 | 23 | 56 | 1 | 1 | 25 | 97 | 79 | 23 | 0 | 28 | 0 | 0 | 258 | 48 | 98 | ... | 7 | 1 | 0 | 20 | 0 | 0 | 98 | 68 | 24 | 82 | 8 | 26 | 143 | 157 | 96 | 72 | 141 | 110 | 8 | 122 | 71 | 198 | 169 | 3 | 8 | 1 | 52 | 87 | 1 | 39 | 6 | 1 | 171 | 3 | 0 | 4 | 0 | 3 | 62 | 84 | 20 | 4 | 6 | 52 | 68 | 143 | 291 | 0 | 0 | 246 |
5 rows × 266 columns
import fastparquet
fastparquet.write('/data/trips_dropoffs_matrix.parquet', z2, compression='SNAPPY')
tzdf = geopandas.read_file('../shapefiles/taxi_zones.shp')
import fastparquet
dropoffs_matrix = fastparquet.ParquetFile('/data/trips_dropoffs_matrix.parquet').to_pandas()
pickups_matrix = fastparquet.ParquetFile('/data/trips_pickups_matrix.parquet').to_pandas()
dropoffs_matrix = dropoffs_matrix.iloc[:, :-3]
pickups_matrix = pickups_matrix.iloc[:, :-3]
counts_matrix = pd.concat([dropoffs_matrix, pickups_matrix], axis=1 )
tzdf.zone[0]
'Newark Airport'
sns.distplot(counts_matrix.iloc[:, 263+0], kde=False)
sns.distplot(counts_matrix.iloc[:, 0], kde=False)
<matplotlib.axes._subplots.AxesSubplot at 0x7f049a3dc710>
import sklearn, sklearn.decomposition
# pca = sklearn.decomposition.PCA(n_components=20, whiten=True)
# # pca.fit(counts_matrix.resample('1D').sum().values)
# pca.fit(counts_matrix.values)
# pca.explained_variance_ratio_
pca = sklearn.decomposition.FastICA(n_components=3, random_state=42, whiten=True)
# pca.fit(counts_matrix.resample('1D').sum().values)
yvals = pca.fit_transform(counts_matrix.values)
# pca.explained_variance_ratio_
/home/shekhar/anaconda3/lib/python3.6/site-packages/sklearn/decomposition/fastica_.py:116: UserWarning: FastICA did not converge. Consider increasing tolerance or the maximum number of iterations. warnings.warn('FastICA did not converge. Consider increasing '
yvals.shape
(70128, 3)
pickup_eof1, dropoff_eof1 = pca.components_[0, :263], pca.components_[0, 263:]
pickup_eof2, dropoff_eof2 = pca.components_[1, :263], pca.components_[1, 263:]
pickup_eof3, dropoff_eof3 = pca.components_[2, :263], pca.components_[2, 263:]
# pickup_eof4, dropoff_eof4 = pca.components_[3, :263], pca.components_[3, 263:]
# pickup_eof5, dropoff_eof5 = pca.components_[4, :263], pca.components_[4, 263:]
tzdf['pEOF1'] = pickup_eof1
tzdf['dEOF1'] = dropoff_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['dEOF2'] = dropoff_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['dEOF3'] = dropoff_eof3
# tzdf['pEOF4'] = pickup_eof4
# tzdf['dEOF4'] = dropoff_eof4
# tzdf['pEOF5'] = pickup_eof5
# tzdf['dEOF5'] = dropoff_eof5
tzdf['N_dropoffs'] = dropoffs_matrix.sum(axis=0).values
tzdf['N_pickups'] = pickups_matrix.sum(axis=0).values
tzdf['log10_N_dropoffs'] = np.log10(tzdf.N_dropoffs)
tzdf['log10_N_pickups'] = np.log10(tzdf.N_pickups)
tzdf = tzdf.to_crs({'init': 'epsg:3857'})
tzdf.head()
LocationID | OBJECTID | Shape_Area | Shape_Leng | borough | geometry | zone | N_dropoffs | N_pickups | log10_N_dropoffs | log10_N_pickups | N_dropoffs_ranked | N_pickups_ranked | pEOF1 | dEOF1 | pEOF2 | dEOF2 | pEOF3 | dEOF3 | pEOF4 | dEOF4 | pEOF5 | dEOF5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0.000782 | 0.116357 | EWR | POLYGON ((-8258175.532737531 4967457.202992616... | Newark Airport | 1801889 | 109695 | 6.255728 | 5.040187 | 0.667939 | 0.469466 | -2.717661e-08 | -2.881359e-09 | -1.794189e-08 | -1.433908e-09 | 5.950944e-08 | 3.195680e-09 | -8.254247e-08 | -5.718085e-10 | -1.040261e-07 | -3.076457e-09 |
1 | 2 | 2 | 0.004866 | 0.433470 | Queens | (POLYGON ((-8217980.621910957 4959237.28547167... | Jamaica Bay | 10387 | 11702 | 4.016490 | 4.068260 | 0.053435 | 0.164122 | 1.004834e-10 | 7.612880e-11 | -4.304754e-11 | -6.442688e-11 | -1.523368e-10 | -1.357631e-10 | 3.737542e-10 | 5.648617e-10 | 3.248668e-10 | 4.355836e-10 |
2 | 3 | 3 | 0.000314 | 0.084341 | Bronx | POLYGON ((-8220713.534155379 4993383.154018582... | Allerton/Pelham Gardens | 96883 | 16547 | 4.986248 | 4.218719 | 0.213740 | 0.202290 | -4.847524e-11 | 4.417117e-11 | 1.598215e-10 | -4.446037e-11 | -2.216636e-09 | -5.661792e-11 | -5.797356e-09 | -8.936931e-10 | -2.671187e-09 | -4.799690e-10 |
3 | 4 | 4 | 0.000112 | 0.043567 | Manhattan | POLYGON ((-8234500.226961648 4971984.093397928... | Alphabet City | 6643997 | 4752651 | 6.822429 | 6.676936 | 0.812977 | 0.805344 | -7.417994e-08 | 2.380392e-07 | -2.521742e-08 | 6.128409e-08 | -1.776279e-07 | -2.114199e-07 | 3.549393e-08 | -5.165418e-07 | 1.502940e-07 | -1.212258e-07 |
4 | 5 | 5 | 0.000498 | 0.092146 | Staten Island | POLYGON ((-8257036.10884249 4948033.094989423,... | Arden Heights | 7340 | 1025 | 3.865696 | 3.010724 | 0.034351 | 0.015267 | -4.787781e-11 | 7.245953e-12 | 6.745541e-12 | -1.782181e-12 | -2.716417e-10 | -1.632698e-11 | -1.791913e-11 | -2.047445e-11 | 3.142611e-11 | -1.017828e-11 |
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]
tzdf2 = tzdf2.sort_values('N_dropoffs')
tzdf2['N_dropoffs_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('N_pickups')
tzdf2['N_pickups_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('LocationID')
tzdf2.plot(figsize=(12, 18), alpha=1, column='N_dropoffs_ranked', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5)
ax = plt.gca()
plt.grid(False)
ax.set_facecolor('k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
tzdf2.plot(figsize=(12, 18), alpha=1, column='N_pickups_ranked', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5)
ax = plt.gca()
plt.grid(False)
ax.set_facecolor('k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
tzdf2.iloc[:, -10:].describe()
pEOF1 | dEOF1 | pEOF2 | dEOF2 | pEOF3 | dEOF3 | pEOF4 | dEOF4 | pEOF5 | dEOF5 | |
---|---|---|---|---|---|---|---|---|---|---|
count | 2.420000e+02 | 2.420000e+02 | 2.420000e+02 | 2.420000e+02 | 2.420000e+02 | 2.420000e+02 | 2.420000e+02 | 2.420000e+02 | 2.420000e+02 | 2.420000e+02 |
mean | 1.337247e-08 | 1.323585e-08 | -3.038017e-08 | -3.044156e-08 | -3.483880e-08 | -3.482810e-08 | -4.837575e-08 | -4.892726e-08 | -3.441977e-08 | -3.494932e-08 |
std | 3.221157e-07 | 2.155272e-07 | 1.011613e-07 | 1.191920e-07 | 1.461272e-07 | 2.128391e-07 | 2.563503e-07 | 4.767413e-07 | 2.825250e-07 | 2.698045e-07 |
min | -1.041723e-06 | -1.178042e-06 | -6.680994e-07 | -7.350545e-07 | -1.130895e-06 | -1.927351e-06 | -1.568877e-06 | -4.064235e-06 | -2.427254e-06 | -1.581324e-06 |
25% | -5.394186e-09 | -5.953277e-10 | -3.959468e-09 | -3.293448e-09 | -5.268074e-08 | -1.307326e-08 | -6.383727e-08 | -2.630257e-08 | -2.527213e-08 | -7.778206e-09 |
50% | -3.556521e-10 | 1.076115e-10 | -7.536052e-11 | -9.318704e-11 | -5.885055e-09 | -2.705909e-10 | -1.180033e-08 | -2.135500e-09 | -4.541858e-09 | -5.239243e-10 |
75% | 4.593931e-09 | 6.834840e-09 | 9.943823e-10 | 5.473768e-11 | -1.525962e-09 | 4.124035e-12 | -4.379473e-10 | 2.358084e-11 | -1.293940e-10 | 3.026373e-10 |
max | 3.253512e-06 | 1.252968e-06 | 2.057265e-07 | 3.911548e-07 | 8.292906e-07 | 7.651686e-07 | 1.753640e-06 | 1.882881e-06 | 1.445455e-06 | 1.378802e-06 |
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 8), alpha=1, column='pEOF1', cmap=plt.cm.RdBu, edgecolor='k',
linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF1', cmap=plt.cm.RdBu, edgecolor='k',
linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
import pysal.esda.mapclassify
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF2', cmap=plt.cm.RdBu, edgecolor='k',
linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF2', cmap=plt.cm.RdBu, edgecolor='k',
linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF3', cmap=plt.cm.RdBu, edgecolor='k',
linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF3', cmap=plt.cm.RdBu, edgecolor='k',
linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
# ax1 = plt.subplot(121)
# tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF4', cmap=plt.cm.RdBu, edgecolor='k',
# linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
# plt.grid(False)
# ax1.set_facecolor('xkcd:silver')
# ax1.xaxis.set_visible(False)
# ax1.yaxis.set_visible(False)
# ax2 = plt.subplot(122)
# tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF4', cmap=plt.cm.RdBu, edgecolor='k',
# linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
# plt.grid(False)
# ax2.set_facecolor('xkcd:silver')
# ax2.xaxis.set_visible(False)
# ax2.yaxis.set_visible(False)
# ax1 = plt.subplot(121)
# tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF5', cmap=plt.cm.RdBu, edgecolor='k',
# linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
# plt.grid(False)
# ax1.set_facecolor('xkcd:silver')
# ax1.xaxis.set_visible(False)
# ax1.yaxis.set_visible(False)
# ax2 = plt.subplot(122)
# tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF5', cmap=plt.cm.RdBu, edgecolor='k',
# linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
# plt.grid(False)
# ax2.set_facecolor('xkcd:silver')
# ax2.xaxis.set_visible(False)
# ax2.yaxis.set_visible(False)
df4 = pd.DataFrame(data=pca.transform(counts_matrix.values)[:, :3], index=counts_matrix.index)
df4.index = df4.index.rename('timepoints')
df4.rename(columns={i:'pc%d' % (i+1) for i in range(3)}, inplace=True)
# df4.reset_index(inplace=True)
df4.plot(lw=1)
plt.xlim('2015-06-22', '2015-06-29')
plt.ylim(-0.02, 0.01)
(-0.02, 0.01)
df4.plot(lw=0.5)
# plt.xlim('2015-06-22', '2015-06-29')
<matplotlib.axes._subplots.AxesSubplot at 0x7f04897ae518>
df4.resample('1M').mean().plot()
df4.resample('1M').std().plot()
# plt.xlim('2015-06-22', '2015-06-29')
<matplotlib.axes._subplots.AxesSubplot at 0x7f04c4441c88>
# df4 = pd.DataFrame(data=pca.transform(counts_matrix.resample('1D').sum().values)[:, :5], index=counts_matrix.resample('1D').sum().index)
# df4.index = df4.index.rename('timepoints')
# df4.rename(columns={i:'pc%d' % i for i in range(5)}, inplace=True)
# # df4.reset_index(inplace=True)
df4.plot()
plt.xlim('2014-04-01', '2014-09-01')
(16161, 16314)
nmf = sklearn.decomposition.NMF(5, random_state=42)
nmf.fit(counts_matrix.resample('1D').sum().values)
# nmf.explained_variance_ratio_
NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200, n_components=5, nls_max_iter=2000, random_state=42, shuffle=False, solver='cd', sparseness=None, tol=0.0001, verbose=0)
nmf.reconstruction_err_
291686.29896657454
pickup_eof1, dropoff_eof1 = nmf.components_[0, :263], nmf.components_[0, 263:]
pickup_eof2, dropoff_eof2 = nmf.components_[1, :263], nmf.components_[1, 263:]
pickup_eof3, dropoff_eof3 = nmf.components_[2, :263], nmf.components_[2, 263:]
pickup_eof4, dropoff_eof4 = nmf.components_[3, :263], nmf.components_[3, 263:]
pickup_eof5, dropoff_eof5 = nmf.components_[4, :263], nmf.components_[4, 263:]
tzdf['pEOF1'] = pickup_eof1
tzdf['dEOF1'] = dropoff_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['dEOF2'] = dropoff_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['dEOF3'] = dropoff_eof3
tzdf['pEOF4'] = pickup_eof4
tzdf['dEOF4'] = dropoff_eof4
tzdf['pEOF5'] = pickup_eof5
tzdf['dEOF5'] = dropoff_eof5
tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]
tzdf2 = tzdf2.sort_values('N_dropoffs')
tzdf2['N_dropoffs_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('N_pickups')
tzdf2['N_pickups_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('LocationID')
tzdf2.iloc[:, -10:].describe()
pEOF1 | dEOF1 | pEOF2 | dEOF2 | pEOF3 | dEOF3 | pEOF4 | dEOF4 | pEOF5 | dEOF5 | |
---|---|---|---|---|---|---|---|---|---|---|
count | 242.000000 | 242.000000 | 242.000000 | 242.000000 | 242.000000 | 242.000000 | 242.000000 | 242.000000 | 242.000000 | 242.000000 |
mean | 33.614941 | 33.744037 | 19.446174 | 19.522582 | 22.824028 | 23.029265 | 12.387393 | 12.510342 | 9.107524 | 8.914849 |
std | 73.643847 | 77.649309 | 39.660845 | 46.353030 | 30.361697 | 36.030383 | 31.689024 | 33.863347 | 18.863313 | 27.253940 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.284592 | 0.012276 | 0.344686 | 0.011539 | 2.180188 | 0.232518 | 0.363029 | 0.000000 | 0.284869 | 0.000000 |
50% | 1.089898 | 0.155107 | 1.118604 | 0.161643 | 8.617385 | 4.500133 | 0.918350 | 0.077347 | 1.153524 | 0.105230 |
75% | 16.729933 | 7.733491 | 16.267112 | 10.980080 | 33.205927 | 29.741598 | 5.186028 | 2.094543 | 7.727718 | 3.108362 |
max | 431.663740 | 436.021587 | 292.790759 | 390.687009 | 141.827463 | 168.425644 | 243.430495 | 213.360773 | 110.659220 | 257.589822 |
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF1', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5, vmin=0, vmax=430., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF1', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5, vmin=0, vmax=430., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF2', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5, vmin=0, vmax=292., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF2', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5, vmin=0, vmax=292., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF3', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5, vmin=0, vmax=168., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF3', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5, vmin=0, vmax=168., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF4', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5, vmin=0, vmax=113., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF4', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5, vmin=0, vmax=113., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF5', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5, vmin=0, vmax=257., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF5', cmap=plt.cm.viridis, edgecolor='k',
linewidth=0.5, vmin=0, vmax=257., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
df4 = pd.DataFrame(data=nmf.transform(counts_matrix.resample('1D').sum().values), index=counts_matrix.resample('1D').sum().index)
df5 = df4.reset_index()
df5 = df5.rename(columns={'index':'d', 0: 'pc1', 1: 'pc2', 2:'pc3', 3:'pc4', 4:'pc5'})
import plotnine as p9
(p9.ggplot(df5, p9.aes('d', 'pc1')) + p9.geom_point(color='steelblue', size=.2)) + p9.stat_smooth(
method='lm',size=1)
<ggplot: (8728644232519)>
(p9.ggplot(df5, p9.aes('d', 'pc2')) + p9.geom_point()) + p9.stat_smooth(method='lowess')
/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings. warnings.warn("Confidence intervals are not yet implemented"
<ggplot: (8728654057252)>
(p9.ggplot(df5, p9.aes('d', 'pc3')) + p9.geom_point()) + p9.stat_smooth(method='lowess')
/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings. warnings.warn("Confidence intervals are not yet implemented"
<ggplot: (8728655259824)>
(p9.ggplot(df5, p9.aes('d', 'pc4')) + p9.geom_point()) + p9.stat_smooth(method='lowess')
/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings. warnings.warn("Confidence intervals are not yet implemented"
<ggplot: (-9223363308200697137)>
(p9.ggplot(df5, p9.aes('d', 'pc5')) + p9.geom_point()) + p9.stat_smooth(method='lowess')
/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings. warnings.warn("Confidence intervals are not yet implemented"
<ggplot: (-9223363308201012066)>
dir(sklearn.decomposition)
['DictionaryLearning', 'FactorAnalysis', 'FastICA', 'IncrementalPCA', 'KernelPCA', 'LatentDirichletAllocation', 'MiniBatchDictionaryLearning', 'MiniBatchSparsePCA', 'NMF', 'PCA', 'ProjectedGradientNMF', 'RandomizedPCA', 'SparseCoder', 'SparsePCA', 'TruncatedSVD', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_online_lda', 'base', 'cdnmf_fast', 'dict_learning', 'dict_learning_online', 'factor_analysis', 'fastica', 'fastica_', 'incremental_pca', 'kernel_pca', 'nmf', 'non_negative_factorization', 'online_lda', 'pca', 'randomized_svd', 'sparse_encode', 'sparse_pca', 'truncated_svd']