In [1]:

%matplotlib inline
%config InlineBackend.figure_format='retina'

import dask.dataframe as dd
import dask.distributed
import numpy as np
import pandas as pd
# import geopandas as gpd

from matplotlib.colors import SymLogNorm as symlog
from matplotlib import rcParams

import sklearn, sklearn.cluster
import matplotlib.pyplot as plt
import palettable

import seaborn as sns

import netCDF4
import geopandas


pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

In [2]:

client = dask.distributed.Client()

In [45]:

tzdf = geopandas.read_file('../shapefiles/taxi_zones.shp')

In [4]:

rcParams['font.sans-serif'] = ('Helvetica', 'Arial', 'Open Sans', 'Bitstream Vera Sans')
rcParams['font.size'] = 12
rcParams['font.stretch'] = 'normal'
rcParams['font.weight'] = 'normal'

rcParams['savefig.dpi'] = 150
rcParams['figure.dpi'] = 150
import seaborn as sns

import os.path
homedirpath = os.path.expanduser('~')
fontdirpath = ''
if '/Users/' in homedirpath:
    fontdirpath = os.path.join(homedirpath, 'Library/Fonts/')
else:
    fontdirpath = os.path.join(homedirpath, '.fonts/')
fontsize2 = 'size={0:0.1f}'.format(12)
rcParams['mathtext.it'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'HelveticaOblique.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.rm'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.tt'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.bf'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'HelveticaBold.ttf:' +
                            fontsize2
                            ).format(fontdirpath))
rcParams['mathtext.cal'] = ((':family=sans-serif:style=normal:variant='
                             'normal:weight=normal:stretch=normal:file='
                             '{0}/Helvetica.ttf:' +
                             fontsize2
                             ).format(fontdirpath))
rcParams['mathtext.sf'] = ((':family=sans-serif:style=normal:variant='
                            'normal:weight=normal:stretch=normal:file={0}/'
                            'Helvetica.ttf:' +
                            fontsize2
                            ).format(fontdirpath))

Section 1: Sample 10 random rows from the dataset, show with human readable names for blog post as HTML¶

In [5]:

df = dd.read_parquet('/data/all_trips.parquet', index='trip_id', 
    columns='pickup_datetime dropoff_datetime pickup_taxizone_id dropoff_taxizone_id'.split())

/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '

In [6]:

df2 = df.sample(frac=1.0e-6, random_state=42).compute()

In [7]:

df2 = df2.dropna()

In [8]:

df3 = df2.merge(
    tzdf['LocationID borough zone'.split()], left_on='pickup_taxizone_id', right_on='LocationID'
)
df3['pickup_location'] = df3.borough.map(str) + " | " +  df3.zone
df3 = df3.drop('LocationID borough zone'.split(), axis=1)

df3 = df3.merge(
    tzdf['LocationID borough zone'.split()], left_on='dropoff_taxizone_id', right_on='LocationID'
)
df3['dropoff_location'] = df3.borough.map(str) + " | " +  df3.zone
df3 = df3.drop('LocationID borough zone'.split(), axis=1)
df3 = df3.sample(frac=1, replace=False, random_state=42).reset_index(drop=True)

In [9]:

df3.head(10).sort_values('pickup_datetime').reset_index(drop=True).to_html().replace("""\n""", "")

Out[9]:

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>pickup_datetime</th>      <th>dropoff_datetime</th>      <th>pickup_taxizone_id</th>      <th>dropoff_taxizone_id</th>      <th>pickup_location</th>      <th>dropoff_location</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>2009-04-02 15:02:43</td>      <td>2009-04-02 15:22:45</td>      <td>138</td>      <td>229</td>      <td>Queens | LaGuardia Airport</td>      <td>Manhattan | Sutton Place/Turtle Bay North</td>    </tr>    <tr>      <th>1</th>      <td>2009-05-03 01:34:00</td>      <td>2009-05-03 01:36:00</td>      <td>79</td>      <td>164</td>      <td>Manhattan | East Village</td>      <td>Manhattan | Midtown South</td>    </tr>    <tr>      <th>2</th>      <td>2010-10-09 00:48:38</td>      <td>2010-10-09 01:00:55</td>      <td>79</td>      <td>237</td>      <td>Manhattan | East Village</td>      <td>Manhattan | Upper East Side South</td>    </tr>    <tr>      <th>3</th>      <td>2011-09-07 10:03:00</td>      <td>2011-09-07 10:09:00</td>      <td>113</td>      <td>211</td>      <td>Manhattan | Greenwich Village North</td>      <td>Manhattan | SoHo</td>    </tr>    <tr>      <th>4</th>      <td>2012-07-21 20:34:00</td>      <td>2012-07-21 20:39:00</td>      <td>211</td>      <td>114</td>      <td>Manhattan | SoHo</td>      <td>Manhattan | Greenwich Village South</td>    </tr>    <tr>      <th>5</th>      <td>2013-05-31 17:45:06</td>      <td>2013-05-31 18:41:07</td>      <td>138</td>      <td>25</td>      <td>Queens | LaGuardia Airport</td>      <td>Brooklyn | Boerum Hill</td>    </tr>    <tr>      <th>6</th>      <td>2014-08-08 09:06:15</td>      <td>2014-08-08 09:14:52</td>      <td>230</td>      <td>100</td>      <td>Manhattan | Times Sq/Theatre District</td>      <td>Manhattan | Garment District</td>    </tr>    <tr>      <th>7</th>      <td>2015-03-28 05:51:31</td>      <td>2015-03-28 06:00:05</td>      <td>263</td>      <td>107</td>      <td>Manhattan | Yorkville West</td>      <td>Manhattan | Gramercy</td>    </tr>    <tr>      <th>8</th>      <td>2015-10-11 10:20:04</td>      <td>2015-10-11 10:31:45</td>      <td>75</td>      <td>194</td>      <td>Manhattan | East Harlem South</td>      <td>Manhattan | Randalls Island</td>    </tr>    <tr>      <th>9</th>      <td>2016-07-15 00:57:44</td>      <td>2016-07-15 01:03:16</td>      <td>125</td>      <td>186</td>      <td>Manhattan | Hudson Sq</td>      <td>Manhattan | Penn Station/Madison Sq West</td>    </tr>  </tbody></table>'

In [10]:

from IPython.display import HTML
HTML(df3.head(10).sort_values('pickup_datetime').reset_index(drop=True).to_html())

Out[10]:

	pickup_datetime	dropoff_datetime	pickup_taxizone_id	dropoff_taxizone_id	pickup_location	dropoff_location
0	2009-04-02 15:02:43	2009-04-02 15:22:45	138	229	Queens \| LaGuardia Airport	Manhattan \| Sutton Place/Turtle Bay North
1	2009-05-03 01:34:00	2009-05-03 01:36:00	79	164	Manhattan \| East Village	Manhattan \| Midtown South
2	2010-10-09 00:48:38	2010-10-09 01:00:55	79	237	Manhattan \| East Village	Manhattan \| Upper East Side South
3	2011-09-07 10:03:00	2011-09-07 10:09:00	113	211	Manhattan \| Greenwich Village North	Manhattan \| SoHo
4	2012-07-21 20:34:00	2012-07-21 20:39:00	211	114	Manhattan \| SoHo	Manhattan \| Greenwich Village South
5	2013-05-31 17:45:06	2013-05-31 18:41:07	138	25	Queens \| LaGuardia Airport	Brooklyn \| Boerum Hill
6	2014-08-08 09:06:15	2014-08-08 09:14:52	230	100	Manhattan \| Times Sq/Theatre District	Manhattan \| Garment District
7	2015-03-28 05:51:31	2015-03-28 06:00:05	263	107	Manhattan \| Yorkville West	Manhattan \| Gramercy
8	2015-10-11 10:20:04	2015-10-11 10:31:45	75	194	Manhattan \| East Harlem South	Manhattan \| Randalls Island
9	2016-07-15 00:57:44	2016-07-15 01:03:16	125	186	Manhattan \| Hudson Sq	Manhattan \| Penn Station/Madison Sq West

Section 2: Calculate Count Matrix for Full Dataset¶

In [11]:

df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
                     columns=['pickup_taxizone_id', 'dropoff_taxizone_id'])
df['pickup_taxizone_id'] = df.pickup_taxizone_id.fillna(266.).astype(np.int32)
df['dropoff_taxizone_id'] = df.dropoff_taxizone_id.fillna(266.).astype(np.int32)

/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '

In [12]:

df.head()

Out[12]:

	pickup_taxizone_id	dropoff_taxizone_id
pickup_datetime
2009-01-01 00:00:00	237	263
2009-01-01 00:00:00	114	249
2009-01-01 00:00:02	237	43
2009-01-01 00:00:04	261	261
2009-01-01 00:00:07	144	80

In [13]:

count_dataframe = df.reset_index().groupby(['pickup_taxizone_id', 'dropoff_taxizone_id']).count().compute()
count_dataframe.columns = ['count']
count_dataframe.shape

Out[13]:

(60710, 1)

In [14]:

count_dataframe.head()

Out[14]:

		count
pickup_taxizone_id	dropoff_taxizone_id
1	1	101493
	186	35
	266	37311
2	2	4297
2	14	20

In [15]:

count_matrix = np.zeros((267, 267), dtype=np.int64)
for r in count_dataframe.reset_index().itertuples():
    count_matrix[r[1], r[2]] = r[3]

In [16]:

count_dataframe.describe()

Out[16]:

	count
count	6.071000e+04
mean	2.277015e+04
std	1.698617e+05
min	1.000000e+00
25%	9.000000e+00
50%	8.800000e+01
75%	1.258000e+03
max	2.556999e+07

In [17]:

count_dataframe.reset_index().head()

Out[17]:

	pickup_taxizone_id	dropoff_taxizone_id	count
0	1	1	101493
1	1	186	35
2	1	266	37311
3	2	2	4297
4	2	14	20

In [18]:

# <!-- collapse=True -->
plt.imshow(count_matrix[1:-3, 1:-3].T, norm=symlog(10000), origin='upper', cmap=plt.cm.Blues)
plt.grid(False)
plt.xlabel("Dropoff Taxi Zone ID")
plt.ylabel("Pickup Taxi Zone ID")
plt.gcf().set_size_inches(4, 4)

Section 3: Calculate Pickups matrix¶

In [19]:

df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
                     columns=['pickup_taxizone_id', 'trip_type'])
df = df[df.trip_type != 'uber']
df = df.drop('trip_type', axis=1)
df['pickup_taxizone_id'] = df.pickup_taxizone_id.fillna(266.).astype(np.int32)

/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '

In [20]:

def get_year_mo_day(data, col):
#     d = np.core.defchararray.replace(np.core.defchararray.add(data.index.values.astype('M8[h]').astype(np.str), ":00"), 'T', ' ')
#     return d
    return data.index.values.astype('M8[h]')

In [21]:

df['pickup_ymd'] = df.map_partitions(get_year_mo_day, 'pickup_datetime', meta=('asdf', np.datetime64))

/home/shekhar/anaconda3/lib/python3.6/site-packages/dask/dataframe/utils.py:232: FutureWarning: Passing in 'datetime64' dtype with no frequency is deprecated and will raise in a future version. Please pass in 'datetime64[ns]' instead.
  return pd.Series([], dtype=dtype, name=name, index=index)

In [22]:

df.reset_index().rename(columns=dict(index='N')).tail()

Out[22]:

	N	pickup_taxizone_id	pickup_ymd
1432504	2016-12-31 23:59:57	36	2016-12-31 23:00:00
1432505	2016-12-31 23:59:58	76	2016-12-31 23:00:00
1432506	2016-12-31 23:59:58	168	2016-12-31 23:00:00
1432507	2016-12-31 23:59:58	144	2016-12-31 23:00:00
1432508	2016-12-31 23:59:59	135	2016-12-31 23:00:00

In [23]:

pickup_counts_df = df.reset_index().rename(columns=dict(index='N')).groupby(['pickup_taxizone_id', 'pickup_ymd',]).count().compute()
pickup_counts_df.sort_index(inplace=True)

In [24]:

pickup_counts_df.head()

Out[24]:

		N
pickup_taxizone_id	pickup_ymd
1	2009-01-01 01:00:00	1
	2009-01-01 02:00:00	1
	2009-01-01 04:00:00	2
	2009-01-01 05:00:00	1
	2009-01-01 07:00:00	1

In [25]:

z = pickup_counts_df.unstack(0)

In [26]:

z.columns = np.arange(1, 267).astype(str)

In [27]:

z = z.merge(
    pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2016-12-31 23:00:00', freq='H')), 
    how='right', left_index=True, right_index=True).fillna(0).astype(np.int32)

In [28]:

z.head()

Out[28]:

	1	4	7	8	9	12	13	14	16	17	18	19	20	22	23	24	25	26	27	28	29	32	33	34	35	36	37	38	40	41	42	43	45	47	48	49	50	...	217	219	220	223	224	225	226	227	228	229	230	231	232	233	234	235	236	237	238	239	241	242	243	244	246	247	248	249	250	251	252	255	256	257	258	260	261	262	263	266
2009-01-01 00:00:00	0	121	42	0	0	4	63	2	0	13	1	1	0	1	3	78	28	0	0	3	0	1	29	1	0	3	16	0	27	53	30	207	21	0	594	18	178	...	2	0	0	18	71	5	35	1	1	379	57	328	70	211	544	1	384	459	407	439	3	0	10	32	211	1	2	409	0	0	0	74	53	4	2	6	70	179	449	369
2009-01-01 01:00:00	1	137	88	0	0	4	63	2	1	11	1	0	0	0	5	84	51	0	1	0	0	0	78	2	0	8	22	0	37	79	41	103	39	0	440	37	184	...	7	0	2	35	121	10	68	0	8	540	130	360	131	298	402	1	382	459	486	549	1	4	24	64	167	3	0	322	1	0	0	97	92	7	0	22	85	239	627	452
2009-01-01 02:00:00	1	125	115	0	0	2	58	4	2	19	1	0	0	0	4	90	54	1	0	1	0	0	54	0	0	4	15	0	54	99	50	121	47	0	486	50	130	...	11	1	3	40	119	6	69	1	10	496	155	348	115	288	295	1	329	267	401	491	1	0	28	71	117	4	0	335	0	0	1	122	115	6	1	21	55	170	594	378
2009-01-01 03:00:00	0	124	110	1	1	1	33	8	3	18	0	0	0	2	3	51	61	0	0	1	1	0	55	1	1	8	22	1	35	67	44	67	40	0	541	42	129	...	9	0	5	34	74	4	53	2	9	349	303	322	80	144	348	1	220	169	199	339	0	1	15	61	115	2	0	261	0	1	0	80	98	5	1	28	70	96	437	313
2009-01-01 04:00:00	2	100	105	0	1	0	19	7	4	8	2	3	1	1	0	41	43	1	0	1	0	0	27	3	0	7	24	0	20	54	28	34	38	1	390	27	120	...	3	0	5	29	34	3	59	1	7	233	308	221	53	116	286	3	91	79	112	186	0	0	10	27	69	6	1	241	0	0	0	80	74	5	0	20	37	62	296	224

5 rows × 266 columns

In [29]:

import fastparquet
fastparquet.write('/data/trips_pickups_matrix.parquet', z, compression='SNAPPY')

Section 4: Calculate Dropoffs matrix¶

In [30]:

df = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime',
                     columns=['dropoff_datetime', 'dropoff_taxizone_id', 'trip_type'])
df = df[df.trip_type != 'uber']
df = df.drop('trip_type', axis=1)
df['dropoff_taxizone_id'] = df.dropoff_taxizone_id.fillna(266.).astype(np.int32)

/home/shekhar/anaconda3/lib/python3.6/site-packages/fastparquet/api.py:436: UserWarning: Regression warning: found category spec from fastparquet <= 0.0.6
  warnings.warn('Regression warning: found category spec from '

In [31]:

def get_year_mo_day(data, col):
#     d = np.core.defchararray.replace(np.core.defchararray.add(data.index.values.astype('M8[h]').astype(np.str), ":00"), 'T', ' ')
#     return d
    return data.index.values.astype('M8[h]')

In [32]:

df['dropoff_ymd'] = df.map_partitions(get_year_mo_day, 'dropoff_datetime', meta=('asdf', np.datetime64))

/home/shekhar/anaconda3/lib/python3.6/site-packages/dask/dataframe/utils.py:232: FutureWarning: Passing in 'datetime64' dtype with no frequency is deprecated and will raise in a future version. Please pass in 'datetime64[ns]' instead.
  return pd.Series([], dtype=dtype, name=name, index=index)

In [33]:

df.reset_index(drop=True).tail()

Out[33]:

	dropoff_datetime	dropoff_taxizone_id	dropoff_ymd
1432504	2017-01-01 00:07:47	36	2016-12-31 23:00:00
1432505	2017-01-01 00:15:29	63	2016-12-31 23:00:00
1432506	2017-01-01 00:39:07	161	2016-12-31 23:00:00
1432507	2017-01-01 00:03:50	209	2016-12-31 23:00:00
1432508	2017-01-01 00:14:30	134	2016-12-31 23:00:00

In [34]:

dropoff_counts_df = df.reset_index(drop=True).rename(columns=dict(dropoff_datetime='N')).groupby(['dropoff_taxizone_id', 'dropoff_ymd',]).count().compute()
dropoff_counts_df.sort_index(inplace=True)

In [35]:

dropoff_counts_df.head()

Out[35]:

		N
dropoff_taxizone_id	dropoff_ymd
1	2009-01-01 01:00:00	2
	2009-01-01 02:00:00	3
	2009-01-01 03:00:00	1
	2009-01-01 04:00:00	10
	2009-01-01 05:00:00	10

In [36]:

z2 = dropoff_counts_df.unstack(0)

In [37]:

z2 = z2.merge(
    pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2016-12-31 23:00:00', freq='H')), 
    how='right', left_index=True, right_index=True).fillna(0).astype(np.int32)

/home/shekhar/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/merge.py:551: UserWarning: merging between different levels can give an unintended result (2 levels on the left, 1 on the right)
  warnings.warn(msg, UserWarning)

In [38]:

z2.columns = np.arange(1, 267).astype(str)

In [39]:

z2.head()

Out[39]:

	1	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	45	46	47	48	49	50	...	217	218	219	220	221	222	223	224	225	226	227	228	229	230	231	232	233	234	235	236	237	238	239	240	241	242	243	244	245	246	247	248	249	250	251	252	253	254	255	256	257	258	259	260	261	262	263	266
2009-01-01 00:00:00	0	1	128	0	1	113	0	0	1	1	1	111	20	1	8	28	3	1	6	2	1	4	84	36	5	1	6	2	0	0	0	57	4	1	10	23	1	0	32	95	81	95	30	0	0	434	44	150	...	7	3	1	9	0	0	53	111	16	56	9	17	352	50	281	75	291	366	3	432	253	407	404	1	10	2	43	82	0	188	0	3	306	3	0	2	0	0	85	75	13	6	3	36	111	282	450	379
2009-01-01 01:00:00	2	2	154	0	1	188	2	1	0	3	1	87	25	4	5	28	4	1	1	2	4	5	96	47	4	1	3	0	0	1	1	104	5	3	8	25	0	0	37	140	91	97	39	1	2	402	49	147	...	10	1	2	24	0	0	89	161	22	94	9	23	372	118	226	96	335	331	8	395	227	434	413	0	9	5	86	106	1	157	2	2	340	2	0	5	0	1	103	84	22	7	7	41	75	346	627	469
2009-01-01 02:00:00	3	1	134	1	0	245	0	4	4	3	3	88	27	3	10	41	7	1	7	3	7	4	89	45	5	0	7	2	0	1	3	85	1	3	15	30	2	0	31	150	103	77	47	0	1	414	65	121	...	10	0	1	15	1	0	105	145	26	103	11	24	285	162	210	84	301	243	2	390	160	358	370	0	4	2	71	118	1	115	12	2	283	1	0	3	1	0	92	98	28	3	4	53	66	283	536	408
2009-01-01 03:00:00	1	1	109	0	0	229	0	4	1	3	1	69	40	2	7	31	7	3	9	5	13	6	86	55	3	0	9	2	1	1	1	69	2	1	21	46	0	0	33	113	100	50	33	0	3	368	69	143	...	9	0	1	19	0	1	107	113	16	84	13	33	245	197	171	85	215	204	3	267	136	245	251	0	6	1	71	105	0	90	5	2	241	0	1	5	0	2	59	81	24	8	5	56	89	233	448	334
2009-01-01 04:00:00	10	2	103	0	0	221	1	3	2	2	1	50	38	1	5	32	4	2	2	4	6	1	56	46	2	0	9	1	0	2	2	46	2	3	23	56	1	1	25	97	79	23	28	0	0	258	48	98	...	7	1	0	20	0	0	98	68	24	82	8	26	143	157	96	72	141	110	8	122	71	198	169	3	8	1	52	87	1	39	6	1	171	3	0	4	0	3	62	84	20	4	6	52	68	143	291	246

5 rows × 266 columns

In [40]:

import fastparquet
fastparquet.write('/data/trips_dropoffs_matrix.parquet', z2, compression='SNAPPY')

Section 5 : Perform PCA¶

In [86]:

tzdf = geopandas.read_file('../shapefiles/taxi_zones.shp')

In [87]:

import fastparquet
dropoffs_matrix = fastparquet.ParquetFile('/data/trips_dropoffs_matrix.parquet').to_pandas()
pickups_matrix = fastparquet.ParquetFile('/data/trips_pickups_matrix.parquet').to_pandas()

In [88]:

dropoffs_matrix = dropoffs_matrix.iloc[:, :-3]
pickups_matrix = pickups_matrix.iloc[:, :-3]

In [135]:

counts_matrix = pd.concat([dropoffs_matrix, pickups_matrix], axis=1 )

In [321]:

tzdf.zone[0]

Out[321]:

'Newark Airport'

In [324]:

sns.distplot(counts_matrix.iloc[:, 263+0], kde=False)
sns.distplot(counts_matrix.iloc[:, 0], kde=False)

Out[324]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f049a3dc710>

In [136]:

import sklearn, sklearn.decomposition

In [414]:

# pca = sklearn.decomposition.PCA(n_components=20, whiten=True)
# # pca.fit(counts_matrix.resample('1D').sum().values)
# pca.fit(counts_matrix.values)
# pca.explained_variance_ratio_

In [500]:

pca = sklearn.decomposition.FastICA(n_components=3, random_state=42, whiten=True)
# pca.fit(counts_matrix.resample('1D').sum().values)
yvals = pca.fit_transform(counts_matrix.values)
# pca.explained_variance_ratio_

/home/shekhar/anaconda3/lib/python3.6/site-packages/sklearn/decomposition/fastica_.py:116: UserWarning: FastICA did not converge. Consider increasing tolerance or the maximum number of iterations.
  warnings.warn('FastICA did not converge. Consider increasing '

In [501]:

yvals.shape

Out[501]:

(70128, 3)

In [502]:

pickup_eof1, dropoff_eof1 = pca.components_[0, :263], pca.components_[0, 263:]
pickup_eof2, dropoff_eof2 = pca.components_[1, :263], pca.components_[1, 263:]
pickup_eof3, dropoff_eof3 = pca.components_[2, :263], pca.components_[2, 263:]
# pickup_eof4, dropoff_eof4 = pca.components_[3, :263], pca.components_[3, 263:]
# pickup_eof5, dropoff_eof5 = pca.components_[4, :263], pca.components_[4, 263:]

In [503]:

tzdf['pEOF1'] = pickup_eof1
tzdf['dEOF1'] = dropoff_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['dEOF2'] = dropoff_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['dEOF3'] = dropoff_eof3
# tzdf['pEOF4'] = pickup_eof4
# tzdf['dEOF4'] = dropoff_eof4
# tzdf['pEOF5'] = pickup_eof5
# tzdf['dEOF5'] = dropoff_eof5

In [504]:

tzdf['N_dropoffs'] = dropoffs_matrix.sum(axis=0).values
tzdf['N_pickups'] = pickups_matrix.sum(axis=0).values

In [505]:

tzdf['log10_N_dropoffs'] = np.log10(tzdf.N_dropoffs)
tzdf['log10_N_pickups'] = np.log10(tzdf.N_pickups)

In [506]:

tzdf = tzdf.to_crs({'init': 'epsg:3857'})

In [507]:

tzdf.head()

Out[507]:

	LocationID	OBJECTID	Shape_Area	Shape_Leng	borough	geometry	zone	N_dropoffs	N_pickups	log10_N_dropoffs	log10_N_pickups	N_dropoffs_ranked	N_pickups_ranked	pEOF1	dEOF1	pEOF2	dEOF2	pEOF3	dEOF3	pEOF4	dEOF4	pEOF5	dEOF5
0	1	1	0.000782	0.116357	EWR	POLYGON ((-8258175.532737531 4967457.202992616...	Newark Airport	1801889	109695	6.255728	5.040187	0.667939	0.469466	-2.717661e-08	-2.881359e-09	-1.794189e-08	-1.433908e-09	5.950944e-08	3.195680e-09	-8.254247e-08	-5.718085e-10	-1.040261e-07	-3.076457e-09
1	2	2	0.004866	0.433470	Queens	(POLYGON ((-8217980.621910957 4959237.28547167...	Jamaica Bay	10387	11702	4.016490	4.068260	0.053435	0.164122	1.004834e-10	7.612880e-11	-4.304754e-11	-6.442688e-11	-1.523368e-10	-1.357631e-10	3.737542e-10	5.648617e-10	3.248668e-10	4.355836e-10
2	3	3	0.000314	0.084341	Bronx	POLYGON ((-8220713.534155379 4993383.154018582...	Allerton/Pelham Gardens	96883	16547	4.986248	4.218719	0.213740	0.202290	-4.847524e-11	4.417117e-11	1.598215e-10	-4.446037e-11	-2.216636e-09	-5.661792e-11	-5.797356e-09	-8.936931e-10	-2.671187e-09	-4.799690e-10
3	4	4	0.000112	0.043567	Manhattan	POLYGON ((-8234500.226961648 4971984.093397928...	Alphabet City	6643997	4752651	6.822429	6.676936	0.812977	0.805344	-7.417994e-08	2.380392e-07	-2.521742e-08	6.128409e-08	-1.776279e-07	-2.114199e-07	3.549393e-08	-5.165418e-07	1.502940e-07	-1.212258e-07
4	5	5	0.000498	0.092146	Staten Island	POLYGON ((-8257036.10884249 4948033.094989423,...	Arden Heights	7340	1025	3.865696	3.010724	0.034351	0.015267	-4.787781e-11	7.245953e-12	6.745541e-12	-1.782181e-12	-2.716417e-10	-1.632698e-11	-1.791913e-11	-2.047445e-11	3.142611e-11	-1.017828e-11

In [526]:

tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]

In [527]:

tzdf2 = tzdf2.sort_values('N_dropoffs')
tzdf2['N_dropoffs_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('N_pickups')
tzdf2['N_pickups_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('LocationID')

In [528]:

tzdf2.plot(figsize=(12, 18), alpha=1, column='N_dropoffs_ranked', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5)
ax = plt.gca()
plt.grid(False)
ax.set_facecolor('k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)

In [529]:

tzdf2.plot(figsize=(12, 18), alpha=1, column='N_pickups_ranked', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5)
ax = plt.gca()
plt.grid(False)
ax.set_facecolor('k')
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)

In [530]:

tzdf2.iloc[:, -10:].describe()

Out[530]:

	pEOF1	dEOF1	pEOF2	dEOF2	pEOF3	dEOF3	pEOF4	dEOF4	pEOF5	dEOF5
count	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02	2.420000e+02
mean	1.337247e-08	1.323585e-08	-3.038017e-08	-3.044156e-08	-3.483880e-08	-3.482810e-08	-4.837575e-08	-4.892726e-08	-3.441977e-08	-3.494932e-08
std	3.221157e-07	2.155272e-07	1.011613e-07	1.191920e-07	1.461272e-07	2.128391e-07	2.563503e-07	4.767413e-07	2.825250e-07	2.698045e-07
min	-1.041723e-06	-1.178042e-06	-6.680994e-07	-7.350545e-07	-1.130895e-06	-1.927351e-06	-1.568877e-06	-4.064235e-06	-2.427254e-06	-1.581324e-06
25%	-5.394186e-09	-5.953277e-10	-3.959468e-09	-3.293448e-09	-5.268074e-08	-1.307326e-08	-6.383727e-08	-2.630257e-08	-2.527213e-08	-7.778206e-09
50%	-3.556521e-10	1.076115e-10	-7.536052e-11	-9.318704e-11	-5.885055e-09	-2.705909e-10	-1.180033e-08	-2.135500e-09	-4.541858e-09	-5.239243e-10
75%	4.593931e-09	6.834840e-09	9.943823e-10	5.473768e-11	-1.525962e-09	4.124035e-12	-4.379473e-10	2.358084e-11	-1.293940e-10	3.026373e-10
max	3.253512e-06	1.252968e-06	2.057265e-07	3.911548e-07	8.292906e-07	7.651686e-07	1.753640e-06	1.882881e-06	1.445455e-06	1.378802e-06

In [531]:

ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 8), alpha=1, column='pEOF1', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF1', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)

In [537]:

import pysal.esda.mapclassify

In [544]:

ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF2', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF2', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)

In [533]:

ax1 = plt.subplot(121)
tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF3', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF3', cmap=plt.cm.RdBu, edgecolor='k',
          linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)

In [516]:

# ax1 = plt.subplot(121)
# tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF4', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
# plt.grid(False)
# ax1.set_facecolor('xkcd:silver')
# ax1.xaxis.set_visible(False)
# ax1.yaxis.set_visible(False)

# ax2 = plt.subplot(122)
# tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF4', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
# plt.grid(False)
# ax2.set_facecolor('xkcd:silver')
# ax2.xaxis.set_visible(False)
# ax2.yaxis.set_visible(False)

In [517]:

# ax1 = plt.subplot(121)
# tzdf2.plot(figsize=(18, 12), alpha=1, column='pEOF5', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax1)
# plt.grid(False)
# ax1.set_facecolor('xkcd:silver')
# ax1.xaxis.set_visible(False)
# ax1.yaxis.set_visible(False)

# ax2 = plt.subplot(122)
# tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF5', cmap=plt.cm.RdBu, edgecolor='k',
#           linewidth=0.5, vmin=-0.2e-5, vmax=0.2e-5, ax=ax2)
# plt.grid(False)
# ax2.set_facecolor('xkcd:silver')
# ax2.xaxis.set_visible(False)
# ax2.yaxis.set_visible(False)

In [520]:

df4 = pd.DataFrame(data=pca.transform(counts_matrix.values)[:, :3], index=counts_matrix.index)
df4.index = df4.index.rename('timepoints')
df4.rename(columns={i:'pc%d' % (i+1) for i in range(3)}, inplace=True)
# df4.reset_index(inplace=True)

In [534]:

df4.plot(lw=1)
plt.xlim('2015-06-22', '2015-06-29')
plt.ylim(-0.02, 0.01)

Out[534]:

(-0.02, 0.01)

In [522]:

df4.plot(lw=0.5)
# plt.xlim('2015-06-22', '2015-06-29')

Out[522]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f04897ae518>

In [524]:

df4.resample('1M').mean().plot()
df4.resample('1M').std().plot()
# plt.xlim('2015-06-22', '2015-06-29')

Out[524]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f04c4441c88>

In [402]:

# df4 = pd.DataFrame(data=pca.transform(counts_matrix.resample('1D').sum().values)[:, :5], index=counts_matrix.resample('1D').sum().index)
# df4.index = df4.index.rename('timepoints')
# df4.rename(columns={i:'pc%d' % i for i in range(5)}, inplace=True)
# # df4.reset_index(inplace=True)

In [403]:

df4.plot()
plt.xlim('2014-04-01', '2014-09-01')

Out[403]:

(16161, 16314)

In [222]:

nmf = sklearn.decomposition.NMF(5, random_state=42)

In [223]:

nmf.fit(counts_matrix.resample('1D').sum().values)
# nmf.explained_variance_ratio_

Out[223]:

NMF(alpha=0.0, beta=1, eta=0.1, init=None, l1_ratio=0.0, max_iter=200,
  n_components=5, nls_max_iter=2000, random_state=42, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [224]:

nmf.reconstruction_err_

Out[224]:

291686.29896657454

In [225]:

pickup_eof1, dropoff_eof1 = nmf.components_[0, :263], nmf.components_[0, 263:]
pickup_eof2, dropoff_eof2 = nmf.components_[1, :263], nmf.components_[1, 263:]
pickup_eof3, dropoff_eof3 = nmf.components_[2, :263], nmf.components_[2, 263:]
pickup_eof4, dropoff_eof4 = nmf.components_[3, :263], nmf.components_[3, 263:]
pickup_eof5, dropoff_eof5 = nmf.components_[4, :263], nmf.components_[4, 263:]

In [226]:

tzdf['pEOF1'] = pickup_eof1
tzdf['dEOF1'] = dropoff_eof1
tzdf['pEOF2'] = pickup_eof2
tzdf['dEOF2'] = dropoff_eof2
tzdf['pEOF3'] = pickup_eof3
tzdf['dEOF3'] = dropoff_eof3
tzdf['pEOF4'] = pickup_eof4
tzdf['dEOF4'] = dropoff_eof4
tzdf['pEOF5'] = pickup_eof5
tzdf['dEOF5'] = dropoff_eof5

In [227]:

tzdf2 = tzdf.copy()
tzdf2 = tzdf2[(tzdf2.borough != 'Staten Island') & (tzdf2.borough != 'EWR')]

In [228]:

tzdf2 = tzdf2.sort_values('N_dropoffs')
tzdf2['N_dropoffs_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('N_pickups')
tzdf2['N_pickups_ranked'] = np.linspace(0, 1., tzdf2.shape[0])
tzdf2 = tzdf2.sort_values('LocationID')

In [235]:

tzdf2.iloc[:, -10:].describe()

Out[235]:

	pEOF1	dEOF1	pEOF2	dEOF2	pEOF3	dEOF3	pEOF4	dEOF4	pEOF5	dEOF5
count	242.000000	242.000000	242.000000	242.000000	242.000000	242.000000	242.000000	242.000000	242.000000	242.000000
mean	33.614941	33.744037	19.446174	19.522582	22.824028	23.029265	12.387393	12.510342	9.107524	8.914849
std	73.643847	77.649309	39.660845	46.353030	30.361697	36.030383	31.689024	33.863347	18.863313	27.253940
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.284592	0.012276	0.344686	0.011539	2.180188	0.232518	0.363029	0.000000	0.284869	0.000000
50%	1.089898	0.155107	1.118604	0.161643	8.617385	4.500133	0.918350	0.077347	1.153524	0.105230
75%	16.729933	7.733491	16.267112	10.980080	33.205927	29.741598	5.186028	2.094543	7.727718	3.108362
max	431.663740	436.021587	292.790759	390.687009	141.827463	168.425644	243.430495	213.360773	110.659220	257.589822

In [236]:

ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF1', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=430., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF1', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=430., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)

In [240]:

ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF2', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=292., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF2', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=292., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)

In [241]:

ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF3', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=168., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF3', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=168., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)

In [243]:

ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF4', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=113., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF4', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=113., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)

In [244]:

ax1 = plt.subplot(121)
tzdf2.plot(figsize=(12, 18), alpha=1, column='pEOF5', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=257., ax=ax1)
plt.grid(False)
ax1.set_facecolor('xkcd:silver')
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)

ax2 = plt.subplot(122)
tzdf2.plot(figsize=(12, 18), alpha=1, column='dEOF5', cmap=plt.cm.viridis, edgecolor='k',
          linewidth=0.5, vmin=0, vmax=257., ax=ax2)
plt.grid(False)
ax2.set_facecolor('xkcd:silver')
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)

In [259]:

df4 = pd.DataFrame(data=nmf.transform(counts_matrix.resample('1D').sum().values), index=counts_matrix.resample('1D').sum().index)

In [272]:

df5 = df4.reset_index()
df5 = df5.rename(columns={'index':'d', 0: 'pc1', 1: 'pc2', 2:'pc3', 3:'pc4', 4:'pc5'})

In [273]:

import plotnine as p9

In [311]:

(p9.ggplot(df5, p9.aes('d', 'pc1')) + p9.geom_point(color='steelblue', size=.2)) + p9.stat_smooth(
    method='lm',size=1)

Out[311]:

<ggplot: (8728644232519)>

In [300]:

(p9.ggplot(df5, p9.aes('d', 'pc2')) + p9.geom_point()) + p9.stat_smooth(method='lowess')

/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"

Out[300]:

<ggplot: (8728654057252)>

In [294]:

(p9.ggplot(df5, p9.aes('d', 'pc3')) + p9.geom_point()) + p9.stat_smooth(method='lowess')

/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"

Out[294]:

<ggplot: (8728655259824)>

In [295]:

(p9.ggplot(df5, p9.aes('d', 'pc4')) + p9.geom_point()) + p9.stat_smooth(method='lowess')

/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"

Out[295]:

<ggplot: (-9223363308200697137)>

In [296]:

(p9.ggplot(df5, p9.aes('d', 'pc5')) + p9.geom_point()) + p9.stat_smooth(method='lowess')

/home/shekhar/anaconda3/lib/python3.6/site-packages/plotnine/stats/smoothers.py:150: UserWarning: Confidence intervals are not yet implementedfor lowess smoothings.
  warnings.warn("Confidence intervals are not yet implemented"

Out[296]:

<ggplot: (-9223363308201012066)>

In [411]:

dir(sklearn.decomposition)

Out[411]:

['DictionaryLearning',
 'FactorAnalysis',
 'FastICA',
 'IncrementalPCA',
 'KernelPCA',
 'LatentDirichletAllocation',
 'MiniBatchDictionaryLearning',
 'MiniBatchSparsePCA',
 'NMF',
 'PCA',
 'ProjectedGradientNMF',
 'RandomizedPCA',
 'SparseCoder',
 'SparsePCA',
 'TruncatedSVD',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_online_lda',
 'base',
 'cdnmf_fast',
 'dict_learning',
 'dict_learning_online',
 'factor_analysis',
 'fastica',
 'fastica_',
 'incremental_pca',
 'kernel_pca',
 'nmf',
 'non_negative_factorization',
 'online_lda',
 'pca',
 'randomized_svd',
 'sparse_encode',
 'sparse_pca',
 'truncated_svd']

In [ ]: