In this notebook we show how the taxi data was pre-processed before being used in the Dash web application.
import vaex
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
The raw data of the New York Taxi dataset is curated by the Taxi and Limousine Commission (TLC) and comes in CSV format. The data is somewhat dirty and requires quite a lot of manual adjustments before it can be converted to the memory-mappable HDF5 file format. As this is quite tedious, we have already done the conversion for the YellowCab taxi data between 2009 and 2015, and the data is available on S3.
We will select a single year of taxi data, for example 2012.
# Read in the data
df = vaex.open('/data/yellow_taxi_2009_2015_f32.hdf5')
# df = vaex.open('s3://vaex/taxi/yellow_taxi_2009_2015_f32.hdf5?anon=true')
# The data is sorted by date, so we can simply slice it
df = df[516794416:695338739+1] # take 2012
df.head_and_tail_print(3)
# | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | payment_type | trip_distance | pickup_longitude | pickup_latitude | rate_code | store_and_fwd_flag | dropoff_longitude | dropoff_latitude | fare_amount | surcharge | mta_tax | tip_amount | tolls_amount | total_amount |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | CMT | 2012-01-10 23:55:50.000000000 | 2012-01-11 00:03:39.000000000 | 1 | CRD | 1.7000000476837158 | -73.99468994140625 | 40.725032806396484 | 1.0 | 0.0 | -73.9759521484375 | 40.73078155517578 | 6.900000095367432 | 0.5 | 0.5 | 1.0 | 0.0 | 8.899999618530273 |
1 | CMT | 2012-01-11 19:18:25.000000000 | 2012-01-11 19:26:10.000000000 | 1 | CSH | 1.100000023841858 | -73.98795318603516 | 40.75294876098633 | 1.0 | 0.0 | -73.9945297241211 | 40.76103973388672 | 6.099999904632568 | 1.0 | 0.5 | 0.0 | 0.0 | 7.599999904632568 |
2 | CMT | 2012-01-11 19:19:19.000000000 | 2012-01-11 19:48:15.000000000 | 2 | CRD | 18.0 | -73.78309631347656 | 40.6485481262207 | 2.0 | 0.0 | -73.99613189697266 | 40.747623443603516 | 45.0 | 0.0 | 0.5 | 10.0600004196167 | 4.800000190734863 | 60.36000061035156 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
178,544,321 | CMT | 2012-12-24 21:09:56.000000000 | 2012-12-24 21:13:22.000000000 | 1 | CSH | 0.30000001192092896 | -73.96232604980469 | 40.77641296386719 | 1.0 | 0.0 | -73.95834350585938 | 40.774906158447266 | 4.0 | 0.5 | 0.5 | 0.0 | 0.0 | 5.0 |
178,544,322 | CMT | 2012-12-24 20:07:23.000000000 | 2012-12-24 20:20:18.000000000 | 1 | CSH | 1.100000023841858 | -73.97209167480469 | 40.75520706176758 | 4.0 | 0.0 | -73.98250579833984 | 40.76253890991211 | 9.5 | 0.5 | 0.5 | 0.0 | 0.0 | 10.5 |
178,544,323 | CMT | 2012-12-24 09:57:17.000000000 | 2012-12-24 09:59:51.000000000 | 2 | CSH | 0.5 | -73.97518920898438 | 40.76108169555664 | 1.0 | 0.0 | -73.98072814941406 | 40.753299713134766 | 4.0 | 0.0 | 0.5 | 0.0 | 0.0 | 4.5 |
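The hard-coded row indices above assume that the file is sorted by pickup date. If you prefer not to rely on them, a minimal alternative sketch is to filter on the year directly (slower, since it scans the full datetime column rather than slicing):
# Equivalent selection without hard-coded row indices (scans the full column)
df = df[df.pickup_datetime.dt.year == 2012]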
If you would rather download the raw CSV files straight from the TLC website, and convert them to HDF5 locally, you can do so by uncommenting and running the cell below.
# # List of files to download
# dlinks = [f'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2012-{m:02}.csv' for m in range(1, 13)]
# # Consistent header names for all files
# header = ['vendor_id',
# 'pickup_datetime',
# 'dropoff_datetime',
# 'passenger_count',
# 'trip_distance',
# 'pickup_longitude',
# 'pickup_latitude',
# 'rate_code',
# 'store_and_fwd_flag',
# 'dropoff_longitude',
# 'dropoff_latitude',
# 'payment_type',
# 'fare_amount',
# 'surcharge',
# 'mta_tax',
# 'tip_amount',
# 'tolls_amount',
# 'total_amount',
# ]
# # Specify the data types of each column;
# # the datetime columns are handled via parse_dates below
# types = {'vendor_id': str,
# 'passenger_count': np.int64,
# 'trip_distance': np.float32,
# 'pickup_longitude': np.float32,
# 'pickup_latitude': np.float32,
# 'rate_code': np.float32,
# 'store_and_fwd_flag': str,
# 'dropoff_longitude': np.float32,
# 'dropoff_latitude': np.float32,
# 'payment_type': str,
# 'fare_amount': np.float32,
# 'surcharge': np.float32,
# 'mta_tax': np.float32,
# 'tip_amount': np.float32,
# 'tolls_amount': np.float32,
# 'total_amount': np.float32,
# }
# for i, link in enumerate(tqdm(dlinks, desc='Downloading and converting raw data...'), start=1):
#     # Read the CSV data with pandas directly from the TLC website
#     df_tmp = pd.read_csv(link,
#                          parse_dates=['pickup_datetime', 'dropoff_datetime'],
#                          header=None,
#                          names=header,
#                          dtype=types,
#                          error_bad_lines=False,  # on pandas >= 1.3 use on_bad_lines='skip'
#                          skiprows=1)
# # Convert to a vaex dataframe
# df = vaex.from_pandas(df_tmp)
# # Export to HDF5
# df.export_hdf5(path=f'yellow_tripdata_2012-{i:02}.hdf5', progress=True)
# # Clean up
# del df, df_tmp
# # Open all of the data as a single DataFrame
# df = vaex.open('*.hdf5')
In what follows we add pick-up and drop-off zone and borough information to the data.
import json
import collections
geo_filename = './aux_data/taxi_zones-tiny.json'
with open(geo_filename) as f:
geo_json = json.load(f)
features = geo_json['features']
# Collect the polygons per borough and per zone, and map each zone to its borough
borough_polygons = collections.defaultdict(list)
zone_polygons = collections.defaultdict(list)
zbmapper = {}
for feature in features:
    properties = feature['properties']
    geo = feature['geometry']
    polygons = []
    for polygon in geo['coordinates']:
        polygon = np.array(polygon)
        if polygon.ndim == 3:
            # Keep only the outer ring of polygons with holes
            polygon = polygon[0]
        polygon = polygon.T  # shape (2, n): rows are longitude and latitude
        assert polygon.shape[0] == 2
        assert polygon.ndim == 2
        polygons.append(polygon)
    borough_polygons[properties['borough']].extend(polygons)
    zone_polygons[properties['zone']].extend(polygons)
    zbmapper[properties['zone']] = properties['borough']
# Integer id -> name mappings; insertion order matches the order of the
# .values() passed to inside_which_polygons below
bmapper = dict(enumerate(borough_polygons))
zmapper = dict(enumerate(zone_polygons))
with open('./aux_data/zone.json', 'w') as f:
json.dump(zmapper, f)
with open('./aux_data/borough.json', 'w') as f:
json.dump(bmapper, f)
with open('./aux_data/zone_to_borough.json', 'w') as f:
json.dump(zbmapper, f)
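Note that json.dump writes the integer keys of zmapper and bmapper as strings, since JSON object keys are always strings. Whatever reads these files back (for example the Dash app) should restore the integer keys; a minimal sketch:
# JSON object keys are always strings, so restore the integer keys on load
with open('./aux_data/zone.json') as f:
    zmapper_loaded = {int(k): v for k, v in json.load(f).items()}
assert zmapper_loaded == zmapper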
# Add the borough and zone ids to the pickup and dropoff locations as virtual columns
df['dropoff_borough'] = df.geo.inside_which_polygons(df.dropoff_longitude, df.dropoff_latitude, borough_polygons.values()).astype('uint8')
df['dropoff_zone'] = df.geo.inside_which_polygons(df.dropoff_longitude, df.dropoff_latitude, zone_polygons.values()).astype('uint16')
df['pickup_borough'] = df.geo.inside_which_polygons(df.pickup_longitude, df.pickup_latitude, borough_polygons.values()).astype('uint8')
df['pickup_zone'] = df.geo.inside_which_polygons(df.pickup_longitude, df.pickup_latitude, zone_polygons.values()).astype('uint16')
df.head_and_tail_print(3)
# | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | payment_type | trip_distance | pickup_longitude | pickup_latitude | rate_code | store_and_fwd_flag | dropoff_longitude | dropoff_latitude | fare_amount | surcharge | mta_tax | tip_amount | tolls_amount | total_amount | dropoff_borough | dropoff_zone | pickup_borough | pickup_zone |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | CMT | 2012-01-10 23:55:50.000000000 | 2012-01-11 00:03:39.000000000 | 1 | CRD | 1.7000000476837158 | -73.99468994140625 | 40.725032806396484 | 1.0 | 0.0 | -73.9759521484375 | 40.73078155517578 | 6.900000095367432 | 0.5 | 0.5 | 1.0 | 0.0 | 8.899999618530273 | 4 | 242 | 4 | 178 |
1 | CMT | 2012-01-11 19:18:25.000000000 | 2012-01-11 19:26:10.000000000 | 1 | CSH | 1.100000023841858 | -73.98795318603516 | 40.75294876098633 | 1.0 | 0.0 | -73.9945297241211 | 40.76103973388672 | 6.099999904632568 | 1.0 | 0.5 | 0.0 | 0.0 | 7.599999904632568 | 4 | 140 | 4 | 48 |
2 | CMT | 2012-01-11 19:19:19.000000000 | 2012-01-11 19:48:15.000000000 | 2 | CRD | 18.0 | -73.78309631347656 | 40.6485481262207 | 2.0 | 0.0 | -73.99613189697266 | 40.747623443603516 | 45.0 | 0.0 | 0.5 | 10.0600004196167 | 4.800000190734863 | 60.36000061035156 | 4 | 89 | 0 | 61 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
178,544,321 | CMT | 2012-12-24 21:09:56.000000000 | 2012-12-24 21:13:22.000000000 | 1 | CSH | 0.30000001192092896 | -73.96232604980469 | 40.77641296386719 | 1.0 | 0.0 | -73.95834350585938 | 40.774906158447266 | 4.0 | 0.5 | 0.5 | 0.0 | 0.0 | 5.0 | 4 | 74 | 4 | 74 |
178,544,322 | CMT | 2012-12-24 20:07:23.000000000 | 2012-12-24 20:20:18.000000000 | 1 | CSH | 1.100000023841858 | -73.97209167480469 | 40.75520706176758 | 4.0 | 0.0 | -73.98250579833984 | 40.76253890991211 | 9.5 | 0.5 | 0.5 | 0.0 | 0.0 | 10.5 | 4 | 234 | 4 | 201 |
178,544,323 | CMT | 2012-12-24 09:57:17.000000000 | 2012-12-24 09:59:51.000000000 | 2 | CSH | 0.5 | -73.97518920898438 | 40.76108169555664 | 1.0 | 0.0 | -73.98072814941406 | 40.753299713134766 | 4.0 | 0.0 | 0.5 | 0.0 | 0.0 | 4.5 | 4 | 187 | 4 | 64 |
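As a quick sanity check, we can decode the integer ids back to human-readable names with the mappers defined above; the None guard skips trips whose coordinates fall outside all polygons:
# Sanity check: decode the integer borough ids back to their names
ids = df.pickup_borough.unique()
print({int(i): bmapper[int(i)] for i in ids if i is not None})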
(Optional) We can materialize the virtual columns to gain speed, at the cost of additional memory.
df.materialize(virtual_column='dropoff_borough', inplace=True)
df.materialize(virtual_column='dropoff_zone', inplace=True)
df.materialize(virtual_column='pickup_borough', inplace=True)
df.materialize(virtual_column='pickup_zone', inplace=True)
# | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | payment_type | trip_distance | pickup_longitude | pickup_latitude | rate_code | store_and_fwd_flag | dropoff_longitude | dropoff_latitude | fare_amount | surcharge | mta_tax | tip_amount | tolls_amount | total_amount | dropoff_borough | dropoff_zone | pickup_borough | pickup_zone |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | CMT | 2012-01-10 23:55:50.000000000 | 2012-01-11 00:03:39.000000000 | 1 | CRD | 1.7000000476837158 | -73.99468994140625 | 40.725032806396484 | 1.0 | 0.0 | -73.9759521484375 | 40.73078155517578 | 6.900000095367432 | 0.5 | 0.5 | 1.0 | 0.0 | 8.899999618530273 | 4 | 242 | 4 | 178 |
1 | CMT | 2012-01-11 19:18:25.000000000 | 2012-01-11 19:26:10.000000000 | 1 | CSH | 1.100000023841858 | -73.98795318603516 | 40.75294876098633 | 1.0 | 0.0 | -73.9945297241211 | 40.76103973388672 | 6.099999904632568 | 1.0 | 0.5 | 0.0 | 0.0 | 7.599999904632568 | 4 | 140 | 4 | 48 |
2 | CMT | 2012-01-11 19:19:19.000000000 | 2012-01-11 19:48:15.000000000 | 2 | CRD | 18.0 | -73.78309631347656 | 40.6485481262207 | 2.0 | 0.0 | -73.99613189697266 | 40.747623443603516 | 45.0 | 0.0 | 0.5 | 10.0600004196167 | 4.800000190734863 | 60.36000061035156 | 4 | 89 | 0 | 61 |
3 | CMT | 2012-01-11 19:19:21.000000000 | 2012-01-11 19:27:00.000000000 | 1 | CRD | 1.7000000476837158 | -73.96751403808594 | 40.758453369140625 | 1.0 | 0.0 | -73.95658111572266 | 40.779903411865234 | 6.900000095367432 | 1.0 | 0.5 | 1.0 | 0.0 | 9.399999618530273 | 4 | 74 | 4 | 68 |
4 | CMT | 2012-01-11 14:38:15.000000000 | 2012-01-11 14:43:51.000000000 | 1 | CSH | 1.2000000476837158 | -74.01131439208984 | 40.711448669433594 | 1.0 | 0.0 | -74.00286865234375 | 40.72813034057617 | 5.699999809265137 | 0.0 | 0.5 | 0.0 | 0.0 | 6.199999809265137 | 4 | 29 | 4 | 260 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
178,544,319 | CMT | 2012-12-26 20:55:55.000000000 | 2012-12-26 21:27:18.000000000 | 1 | CSH | 21.100000381469727 | -73.77667999267578 | 40.64518737792969 | 1.0 | 0.0 | -74.02727508544922 | 40.61878204345703 | 56.0 | 0.5 | 0.5 | 0.0 | 0.0 | 57.0 | 2 | 114 | 0 | 61 |
178,544,320 | CMT | 2012-12-24 07:10:44.000000000 | 2012-12-24 07:16:48.000000000 | 1 | CSH | 2.0999999046325684 | -73.99629211425781 | 40.73793411254883 | 1.0 | 0.0 | -73.97488403320312 | 40.75730514526367 | 8.0 | 0.0 | 0.5 | 0.0 | 0.0 | 8.5 | 4 | 64 | 4 | 217 |
178,544,321 | CMT | 2012-12-24 21:09:56.000000000 | 2012-12-24 21:13:22.000000000 | 1 | CSH | 0.30000001192092896 | -73.96232604980469 | 40.77641296386719 | 1.0 | 0.0 | -73.95834350585938 | 40.774906158447266 | 4.0 | 0.5 | 0.5 | 0.0 | 0.0 | 5.0 | 4 | 74 | 4 | 74 |
178,544,322 | CMT | 2012-12-24 20:07:23.000000000 | 2012-12-24 20:20:18.000000000 | 1 | CSH | 1.100000023841858 | -73.97209167480469 | 40.75520706176758 | 4.0 | 0.0 | -73.98250579833984 | 40.76253890991211 | 9.5 | 0.5 | 0.5 | 0.0 | 0.0 | 10.5 | 4 | 234 | 4 | 201 |
178,544,323 | CMT | 2012-12-24 09:57:17.000000000 | 2012-12-24 09:59:51.000000000 | 2 | CSH | 0.5 | -73.97518920898438 | 40.76108169555664 | 1.0 | 0.0 | -73.98072814941406 | 40.753299713134766 | 4.0 | 0.0 | 0.5 | 0.0 | 0.0 | 4.5 | 4 | 187 | 4 | 64 |
Let us create some features of interest that will be important for the web application.
# Time in transit (minutes) - This is the target variable
df['trip_duration_min'] = (df.dropoff_datetime - df.pickup_datetime) / np.timedelta64(1, 'm')
df['trip_duration_min'] = df.trip_duration_min.astype('float32')
# Speed (miles per hour) - To be used for cleaning of the training data
df['trip_speed_mph'] = df.trip_distance / ((df.dropoff_datetime - df.pickup_datetime) / np.timedelta64(1, 'h'))
df['trip_speed_mph'] = df.trip_speed_mph.astype('float32')
# Extract pickup hour, day and month
df['pickup_hour'] = df.pickup_datetime.dt.hour.astype('int8')
df['pickup_day'] = df.pickup_datetime.dt.dayofweek.astype('int8')
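Before filtering, it is worth inspecting the raw range of the derived features. Zero-duration trips produce infinite speeds, which is also what triggers the RuntimeWarnings shown further below when the filters are evaluated:
# Inspect the raw range of the derived features; inf speeds come from
# zero-duration trips and are removed by the filters below
print(df.trip_duration_min.minmax())
print(df.trip_speed_mph.minmax())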
There are numerous outliers in the data, which we decide to filter out. If you are interested in the rationale behind the following filters, please read through this article.
# Drop missing values
df = df.dropna(column_names=['dropoff_longitude', 'dropoff_latitude'])
# Filter abnormal number of passengers
df = df[(df.passenger_count>0) & (df.passenger_count<7)]
# Keep trips with a non-zero distance shorter than 10 miles
df = df[(df.trip_distance > 0) & (df.trip_distance < 10)]
# Keep trips that lasted between 1 and 60 minutes
df = df[(df.trip_duration_min > 1) & (df.trip_duration_min < 60)]
# Filter out erroneous average trip speeds
df = df[(df.trip_speed_mph > 1) & (df.trip_speed_mph < 60)]
# Keep trips between 1 and 6 miles long (this narrows the distance filter above)
df = df[(df.trip_distance >= 1) & (df.trip_distance <= 6 )]
# Select only trips that were paid for by cash or card
df = df[df.payment_type.isin(['CSH', 'CRD'])]
# Drop missing pickup and dropoff locations
df = df.dropmissing(['dropoff_borough', 'dropoff_zone', 'pickup_borough', 'pickup_zone'])
# # Make a selection based on the boundaries
# # Define the NYC boundaries
# long_min = -74.05
# long_max = -73.75
# lat_min = 40.58
# lat_max = 40.90
# df = df[(df.pickup_longitude > long_min) & (df.pickup_longitude < long_max) & \
# (df.pickup_latitude > lat_min) & (df.pickup_latitude < lat_max) & \
# (df.dropoff_longitude > long_min) & (df.dropoff_longitude < long_max) & \
# (df.dropoff_latitude > lat_min) & (df.dropoff_latitude < lat_max)]
See the number of taxi trips in the filtered DataFrame:
print(f'The filtered DataFrame contains {len(df):,} records.')
<string>:1: RuntimeWarning: divide by zero encountered in true_divide
<string>:1: RuntimeWarning: invalid value encountered in true_divide
<string>:1: RuntimeWarning: invalid value encountered in greater
<string>:1: RuntimeWarning: invalid value encountered in less
The filtered DataFrame contains 117,444,049 records.
Sort the data by pickup location to achieve better data locality, which marginally improves the performance of the dashboard.
df = df.sort(by=['pickup_zone', 'pickup_longitude', 'pickup_latitude'])
Export the data to disk.
df.export_hdf5('/data/taxi/nyc_taxi_2012.hdf5', progress=True)
[########################################] 99.99% estimated time: 0.01s = 0.0m = 0.0h
# Read in the exported dataset
df = vaex.open('/data/taxi/nyc_taxi_2012.hdf5')
df
# | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | payment_type | trip_distance | pickup_longitude | pickup_latitude | rate_code | store_and_fwd_flag | dropoff_longitude | dropoff_latitude | fare_amount | surcharge | mta_tax | tip_amount | tolls_amount | total_amount | dropoff_borough | dropoff_zone | pickup_borough | pickup_zone | trip_duration_min | trip_speed_mph | pickup_hour | pickup_day |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | VTS | 2012-03-31 18:13:00.000000000 | 2012-03-31 18:24:00.000000000 | 1 | CRD | 2.2300000190734863 | -73.87706756591797 | 40.61756134033203 | 1.0 | nan | -73.86943817138672 | 40.62184524536133 | 8.100000381469727 | 0.0 | 0.5 | 1.5 | 0.0 | 10.100000381469727 | 0 | 0 | 0 | 0 | 11.0 | 12.163636207580566 | 18 | 5 |
1 | VTS | 2012-01-31 02:17:00.000000000 | 2012-01-31 02:26:00.000000000 | 1 | CSH | 3.559999942779541 | -73.87632751464844 | 40.61985397338867 | 1.0 | nan | -73.86920166015625 | 40.62523651123047 | 10.5 | 0.5 | 0.5 | 0.0 | 0.0 | 11.5 | 0 | 0 | 0 | 0 | 9.0 | 23.733333587646484 | 2 | 1 |
2 | VTS | 2012-03-24 20:19:00.000000000 | 2012-03-24 20:24:00.000000000 | 1 | CRD | 1.149999976158142 | -73.87577056884766 | 40.621028900146484 | 1.0 | nan | -73.85384368896484 | 40.62257385253906 | 5.300000190734863 | 0.5 | 0.5 | 1.159999966621399 | 0.0 | 7.460000038146973 | 0 | 0 | 0 | 0 | 5.0 | 13.799999237060547 | 20 | 5 |
3 | VTS | 2012-09-09 00:57:00.000000000 | 2012-09-09 01:13:00.000000000 | 1 | CSH | 4.190000057220459 | -73.87551879882812 | 40.61985778808594 | 1.0 | nan | -73.89090728759766 | 40.63335037231445 | 15.5 | 0.5 | 0.5 | 0.0 | 0.0 | 16.5 | 2 | 5 | 0 | 0 | 16.0 | 15.71250057220459 | 0 | 6 |
4 | VTS | 2012-04-24 18:59:00.000000000 | 2012-04-24 19:04:00.000000000 | 1 | CRD | 1.1799999475479126 | -73.87530517578125 | 40.62031555175781 | 1.0 | nan | -73.87605285644531 | 40.61798858642578 | 5.300000190734863 | 1.0 | 0.5 | 1.2599999904632568 | 0.0 | 8.0600004196167 | 0 | 0 | 0 | 0 | 5.0 | 14.15999984741211 | 18 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
117,444,044 | VTS | 2012-10-20 05:38:00.000000000 | 2012-10-20 05:49:00.000000000 | 1 | CRD | 4.349999904632568 | -74.00861358642578 | 40.711395263671875 | 1.0 | nan | -73.98361206054688 | 40.76372146606445 | 13.5 | 0.5 | 0.5 | 3.5 | 0.0 | 18.0 | 4 | 234 | 4 | 260 | 11.0 | 23.727272033691406 | 5 | 5 |
117,444,045 | VTS | 2012-02-24 15:12:00.000000000 | 2012-02-24 15:28:00.000000000 | 4 | CSH | 3.609999895095825 | -74.00860595703125 | 40.71139144897461 | 1.0 | nan | -73.99629211425781 | 40.75292205810547 | 12.5 | 0.0 | 0.5 | 0.0 | 0.0 | 13.0 | 4 | 150 | 4 | 260 | 16.0 | 13.53749942779541 | 15 | 4 |
117,444,046 | CMT | 2012-03-04 17:28:40.000000000 | 2012-03-04 17:45:05.000000000 | 1 | CSH | 4.0 | -74.00860595703125 | 40.71139144897461 | 1.0 | 0.0 | -73.97952270507812 | 40.76235580444336 | 12.100000381469727 | 0.0 | 0.5 | 0.0 | 0.0 | 12.600000381469727 | 4 | 234 | 4 | 260 | 16.41666603088379 | 14.61928939819336 | 17 | 6 |
117,444,047 | VTS | 2012-08-18 19:26:00.000000000 | 2012-08-18 19:43:00.000000000 | 1 | CRD | 4.78000020980835 | -74.00860595703125 | 40.71139144897461 | 1.0 | nan | -73.98592376708984 | 40.76795196533203 | 14.100000381469727 | 0.0 | 0.5 | 1.0 | 0.0 | 15.600000381469727 | 4 | 139 | 4 | 260 | 17.0 | 16.870588302612305 | 19 | 5 |
117,444,048 | CMT | 2012-09-09 18:47:12.000000000 | 2012-09-09 18:58:14.000000000 | 1 | CRD | 2.4000000953674316 | -74.00860595703125 | 40.71139144897461 | 1.0 | 0.0 | -73.99994659423828 | 40.741336822509766 | 10.0 | 0.0 | 0.5 | 2.0999999046325684 | 0.0 | 12.600000381469727 | 4 | 9 | 4 | 260 | 11.033333778381348 | 13.051360130310059 | 18 | 6 |
# Check dtypes
df.dtypes
vendor_id              <class 'str'>
pickup_datetime       datetime64[ns]
dropoff_datetime      datetime64[ns]
passenger_count                int64
payment_type           <class 'str'>
trip_distance                float32
pickup_longitude             float32
pickup_latitude              float32
rate_code                    float32
store_and_fwd_flag           float32
dropoff_longitude            float32
dropoff_latitude             float32
fare_amount                  float32
surcharge                    float32
mta_tax                      float32
tip_amount                   float32
tolls_amount                 float32
total_amount                 float32
dropoff_borough                uint8
dropoff_zone                  uint16
pickup_borough                 uint8
pickup_zone                   uint16
trip_duration_min            float32
trip_speed_mph               float32
pickup_hour                     int8
pickup_day                      int8
dtype: object
For convenience we have also placed this exported dataset on S3, and you can access it simply by running:
df = vaex.open('s3://vaex/taxi/yellow_taxi_2012_zones.hdf5?anon=true')
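On first access vaex downloads the data and caches it locally (typically under ~/.vaex/file-cache), so subsequent opens do not re-download the file.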