In [1]:
import dask.dataframe as dd
import dask.distributed
import numpy as np
In [2]:
# Start a local dask.distributed cluster; this client registers itself as
# the default, so subsequent dask computations run on it.
client = dask.distributed.Client()
In [4]:
# Inspect the full schema (all column names) before deciding which
# columns are actually needed downstream.

df0 = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime')
df0.columns
Out[4]:
Index(['dropoff_datetime', 'dropoff_latitude', 'dropoff_longitude',
       'dropoff_taxizone_id', 'ehail_fee', 'extra', 'fare_amount',
       'improvement_surcharge', 'mta_tax', 'passenger_count', 'payment_type',
       'pickup_latitude', 'pickup_longitude', 'pickup_taxizone_id',
       'rate_code_id', 'store_and_fwd_flag', 'tip_amount', 'tolls_amount',
       'total_amount', 'trip_distance', 'trip_type', 'vendor_id', 'trip_id'],
      dtype='object')
In [21]:
# Load only the columns we need -- large speedup over reading everything.
# Coordinates are needed for plotting; total_amount is kept as the
# aggregation column. (Datetimes, zone ids, trip_type and passenger_count
# were considered but deliberately left out.)
wanted_columns = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'total_amount',
]
df = dd.read_parquet('/data/all_trips_spark.parquet', engine='arrow',
                     columns=wanted_columns)
In [22]:
# Peek at the first rows to sanity-check the selected columns.
df.head()
Out[22]:
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude total_amount
0 -73.965919 40.771244 -73.949608 40.777058 5.800000
1 -73.997482 40.725952 -74.005936 40.735703 5.400000
2 -73.964798 40.767391 -73.977753 40.773746 5.800000
3 -74.011597 40.708832 -74.013466 40.709358 4.600000
4 -74.000648 40.718578 -73.944580 40.712368 27.799999
In [23]:
# Peek at the last rows; note (output below) that some rows have
# missing (NaN) coordinates -- these are filtered out next.
df.tail()
Out[23]:
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude total_amount
1432504 NaN NaN NaN NaN 8.300000
1432505 NaN NaN NaN NaN 17.299999
1432506 NaN NaN NaN NaN 41.759998
1432507 NaN NaN NaN NaN 6.300000
1432508 NaN NaN NaN NaN 14.300000
In [24]:
# Select only those points within some reasonable bounds: coordinates must
# be present and within half a degree of central NYC (lat 40.75, lon -73.9).

def _in_nyc_box(lat, lon):
    "Boolean mask: lat/lon are non-null and within 0.5 degrees of NYC center."
    return (lat.notnull() & lon.notnull()
            & ((lat - 40.75).abs() <= 0.5)
            & ((lon + 73.9).abs() <= 0.5))

# Apply the same predicate to pickup and dropoff coordinates in turn.
df = df[_in_nyc_box(df.pickup_latitude, df.pickup_longitude)]
df = df[_in_nyc_box(df.dropoff_latitude, df.dropoff_longitude)]
In [25]:
# We get about 1.27 billion points
# Count surviving rows -- about 1.27 billion points remain after filtering.
df.count().compute()
Out[25]:
pickup_longitude     1268170371
pickup_latitude      1268170371
dropoff_longitude    1268170371
dropoff_latitude     1268170371
total_amount         1268170371
dtype: int64
In [26]:
# Re-check the first rows after filtering (still in degrees here).
df.head()
Out[26]:
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude total_amount
0 -73.965919 40.771244 -73.949608 40.777058 5.800000
1 -73.997482 40.725952 -74.005936 40.735703 5.400000
2 -73.964798 40.767391 -73.977753 40.773746 5.800000
3 -74.011597 40.708832 -74.013466 40.709358 4.600000
4 -74.000648 40.718578 -73.944580 40.712368 27.799999
In [27]:
def convert_lon(d, latvar):
    """Convert the latitude column ``latvar`` of partition ``d`` (degrees)
    to web-mercator y coordinates (meters).

    Note: despite the name, this converts *latitude* to the mercator y
    coordinate; longitude -> x is a pure linear scaling done elsewhere.

    The original body first made a full copy of the column
    (``k = d[latvar].copy()``) that was immediately overwritten -- a dead
    statement wasting one column copy per partition; it has been removed.
    """
    # y = (R / 180deg) * ln(tan(45deg + lat/2)) expressed in degrees,
    # with 20037508.34 m = half the web-mercator world circumference.
    return (20037508.34 / 180) * (
        np.log(np.tan((90. + d[latvar]) * np.pi / 360)) / (np.pi / 180.)
    )
In [28]:
# Convert lats and lons to web mercator projection
df['pickup_longitude'] = df.pickup_longitude * (20037508.34 / 180)
df['pickup_latitude'] = df.map_partitions(convert_lon, 'pickup_latitude')
df['dropoff_longitude'] = df.dropoff_longitude * (20037508.34 / 180)
df['dropoff_latitude'] = df.map_partitions(convert_lon, 'dropoff_latitude')
In [29]:
# Consolidate partitions for faster plotting
# Consolidate into 200 partitions and materialize to local disk for faster
# plotting; this also executes the filtering/projection graph built above.
df.repartition(npartitions=200).to_parquet('/tmp/filtered.parquet', compression='SNAPPY')
In [30]:
# Read the consolidated data back in
# Read the consolidated data back in, replacing the lazy computation graph
# with a simple read of the materialized file.
df = dd.read_parquet('/tmp/filtered.parquet')
In [31]:
# Subsample the data 
# It's currently commented out, but it's useful 
# when iterating on plot details (axes, ranges, etc.), 
# as it greatly speeds up plot redrawing. 

# df = client.persist(df.sample(frac=0.02))
In [32]:
# Coordinates are now in web-mercator meters (compare with the
# degree-valued heads above).
df.head()
Out[32]:
pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude total_amount
0 -8233848.5 4978660.0 -8232033.0 4979513.0 5.800000
1 -8237362.0 4972004.0 -8238303.0 4973436.5 5.400000
2 -8233724.0 4978093.5 -8235166.0 4979025.5 5.800000
3 -8238933.5 4969490.0 -8239141.5 4969566.0 4.600000
4 -8237714.5 4970922.5 -8231473.0 4970009.0 27.799999
In [61]:
# Plotting imports. (The original cell imported `datashader as ds` twice;
# the duplicate line has been removed.)
import datashader as ds
import datashader.transfer_functions as tf

from datashader.bokeh_ext import InteractiveImage
from functools import partial
from datashader.utils import export_image
from datashader.colors import colormap_select, Greys9, Hot, viridis, inferno
from IPython.core.display import HTML, display
In [62]:
from bokeh.models import BoxZoomTool
from bokeh.plotting import figure, output_notebook, show

# Activate inline bokeh output in the notebook.
output_notebook()

# Set centers, bounds, and ranges in web mercator coords,
# matching the projected coordinates produced above.
x_center = -8234000
y_center = 4973000

x_half_range = 30000
y_half_range = 25000

NYC = x_range, y_range = ((x_center - x_half_range, x_center + x_half_range), 
                          (y_center-y_half_range, y_center+y_half_range))

# plot_width scales (quadratically?) with memory consumption.
# With 32GB RAM, I can set this to 2000, but 2500 crashes with MemoryError.
# I used this setting for high quality, large plots.
# plot_width = 2000

# plot_width of 400 seems to require less than 4GB, and makes the notebook more manageable.
# Also changes aesthetic appearance by decreasing GPS "noise" due to coarse binning
plot_width  = 400 

# Derive height from width so pixel aspect matches the x/y extent ratio.
plot_height = int(plot_width/(x_half_range/y_half_range))

def base_plot(tools='pan,wheel_zoom,reset,save', plot_width=plot_width,
              plot_height=plot_height, **plot_args):
    """Build a bare bokeh figure over the NYC web-mercator extent.

    Axes, grid lines, and borders are stripped so a datashaded image can
    fill the whole canvas; an aspect-preserving box-zoom tool is added.
    Extra keyword args are forwarded to ``figure``.
    """
    fig = figure(tools=tools, plot_width=plot_width, plot_height=plot_height,
                 x_range=x_range, y_range=y_range, outline_line_color=None,
                 min_border=0, min_border_left=0, min_border_right=0,
                 min_border_top=0, min_border_bottom=0, **plot_args)

    # Hide chart furniture: no axes, no grid lines.
    fig.axis.visible = False
    for grid in (fig.xgrid, fig.ygrid):
        grid.grid_line_color = None

    # Keep x/y scaling locked while box-zooming.
    fig.add_tools(BoxZoomTool(match_aspect=True))

    return fig
    
# NOTE(review): `options` is not referenced in any cell shown here --
# confirm it is used elsewhere or remove it.
options = dict(line_color=None, fill_color='blue', size=5)
Loading BokehJS ...

Pickups

In [40]:
# Shading configuration for the pickups plot: export PNGs to ./export on a
# black background; reverse the colormap when the background is black.
background = "black"
export = partial(export_image, export_path="export", background=background)
cm = partial(colormap_select, reverse=(background=="black"))
# NOTE(review): `cm` is not referenced in the cells shown -- confirm it's needed.


def create_image_1(x_range, y_range, w=plot_width, h=plot_height):
    """Datashade pickup locations over the given extent.

    Bins points into a w x h canvas counting non-null total_amount per
    pixel, shades with viridis using histogram equalization, and spreads
    isolated pixels so sparse regions stay visible.
    """
    canvas = ds.Canvas(plot_width=w, plot_height=h,
                       x_range=x_range, y_range=y_range)
    agg = canvas.points(df, 'pickup_longitude', 'pickup_latitude',
                        ds.count('total_amount'))
    shaded = tf.shade(agg, cmap=viridis, how='eq_hist')
    return tf.dynspread(shaded, threshold=0.5, max_px=4)
In [41]:
# Render pickups: save a full-size PNG to ./export and embed an interactive
# (re-rendering on zoom/pan) version in the notebook.
p = base_plot(background_fill_color=background)
export(create_image_1(x_range, y_range, plot_width, plot_height),"pickups_large_wide")
InteractiveImage(p, create_image_1)
Out[41]:

Dropoffs

In [42]:
# Shading configuration for the dropoffs plot (repeats the pickups setup).
background = "black"
export = partial(export_image, export_path="export", background=background)
cm = partial(colormap_select, reverse=(background=="black"))
# NOTE(review): `cm` is not referenced in the cells shown -- confirm it's needed.


def create_image_2(x_range, y_range, w=plot_width, h=plot_height):
    """Datashade dropoff locations over the given extent.

    Bins points into a w x h canvas counting non-null total_amount per
    pixel, shades with inferno using histogram equalization, and spreads
    isolated pixels so sparse regions stay visible.
    """
    canvas = ds.Canvas(plot_width=w, plot_height=h,
                       x_range=x_range, y_range=y_range)
    agg = canvas.points(df, 'dropoff_longitude', 'dropoff_latitude',
                        ds.count('total_amount'))
    shaded = tf.shade(agg, cmap=inferno, how='eq_hist')
    return tf.dynspread(shaded, threshold=0.5, max_px=4)
In [43]:
# Render dropoffs: save a full-size PNG to ./export and embed an interactive
# (re-rendering on zoom/pan) version in the notebook.
p = base_plot(background_fill_color=background)
export(create_image_2(x_range, y_range, plot_width, plot_height),"dropoffs_large_wide")
InteractiveImage(p, create_image_2)
Out[43]: