# This notebook expects that Modin and Ray are installed, e.g. by `pip install modin[ray]`.
# For all ways to install Modin see official documentation at:
# https://modin.readthedocs.io/en/latest/installation.html
import modin.pandas as pd
columns_names = [
"trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag",
"rate_code_id", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude",
"passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax", "tip_amount",
"tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount", "payment_type",
"trip_type", "pickup", "dropoff", "cab_type", "precipitation", "snow_depth", "snowfall",
"max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid",
"pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010",
"pickup_boroct2010", "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname", "pickup_puma",
"dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode", "dropoff_boroname",
"dropoff_ct2010", "dropoff_boroct2010", "dropoff_cdeligibil", "dropoff_ntacode",
"dropoff_ntaname", "dropoff_puma",
]
parse_dates=["pickup_datetime", "dropoff_datetime"]
df = pd.read_csv('https://modin-datasets.s3.amazonaws.com/trips_data.csv', names=columns_names,
header=None, parse_dates=parse_dates)
UserWarning: Parameters provided defaulting to pandas implementation. To request implementation, send an email to feature_requests@modin.org.
df
trip_id | vendor_id | pickup_datetime | dropoff_datetime | store_and_fwd_flag | rate_code_id | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | ... | dropoff_nyct2010_gid | dropoff_ctlabel | dropoff_borocode | dropoff_boroname | dropoff_ct2010 | dropoff_boroct2010 | dropoff_cdeligibil | dropoff_ntacode | dropoff_ntaname | dropoff_puma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2 | 2013-08-01 08:14:37 | 2013-08-01 09:09:06 | N | 1 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2 | 2 | 2013-08-01 09:13:00 | 2013-08-01 11:38:00 | N | 1 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 3 | 2 | 2013-08-01 09:48:00 | 2013-08-01 09:49:00 | N | 5 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 4 | 2 | 2013-08-01 10:38:35 | 2013-08-01 10:38:51 | N | 1 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 5 | 2 | 2013-08-01 11:51:45 | 2013-08-01 12:03:52 | N | 1 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9995 | 9881 | 2 | 2013-09-29 01:03:48 | 2013-09-29 01:09:49 | N | 1 | -73.958336 | 40.820820 | -73.953773 | 40.825195 | ... | 907.0 | 225.0 | 1.0 | Manhattan | 22500.0 | 1022500.0 | E | MN04 | Hamilton Heights | 3802.0 |
9996 | 9882 | 2 | 2013-09-29 03:04:10 | 2013-09-29 03:09:37 | N | 1 | -73.958824 | 40.820251 | -73.934174 | 40.853394 | ... | 912.0 | 271.0 | 1.0 | Manhattan | 27100.0 | 1027100.0 | E | MN35 | Washington Heights North | 3801.0 |
9997 | 9883 | 2 | 2013-09-30 16:28:12 | 2013-09-30 16:56:03 | N | 1 | -73.956100 | 40.818974 | -73.941055 | 40.789993 | ... | 1318.0 | 170.0 | 1.0 | Manhattan | 17000.0 | 1017000.0 | E | MN33 | East Harlem South | 3804.0 |
9998 | 9884 | 2 | 2013-09-01 13:15:15 | 2013-09-01 13:23:10 | N | 1 | -73.955345 | 40.820053 | -73.942444 | 40.841507 | ... | 911.0 | 251.0 | 1.0 | Manhattan | 25100.0 | 1025100.0 | E | MN36 | Washington Heights South | 3801.0 |
9999 | 9885 | 2 | 2013-09-20 07:32:17 | 2013-09-20 08:01:06 | N | 1 | -73.955353 | 40.820213 | -73.957680 | 40.765190 | ... | 1758.0 | 116.0 | 1.0 | Manhattan | 11600.0 | 1011600.0 | I | MN31 | Lenox Hill-Roosevelt Island | 3805.0 |
10000 rows x 51 columns
def q1(df):
return df.groupby("cab_type")["cab_type"].count()
def q2(df):
return df.groupby("passenger_count", as_index=False).mean()[["passenger_count", "total_amount"]]
def q3(df):
return df.groupby(["passenger_count", "pickup_datetime"]).size().reset_index()
def q4(df):
transformed = pd.DataFrame({
"passenger_count": df["passenger_count"],
"pickup_datetime": df["pickup_datetime"].dt.year,
"trip_distance": df["trip_distance"].astype("int64"),
})
return transformed.groupby(["passenger_count", "pickup_datetime", "trip_distance"]) \
.size().reset_index().sort_values(by=["pickup_datetime", 0], ascending=[True, False])
q1(df)
10000
q2(df)
passenger_count | total_amount | |
---|---|---|
0 | 0 | 18.333333 |
1 | 1 | 15.258850 |
2 | 2 | 20.332356 |
3 | 3 | 13.748845 |
4 | 4 | 19.742688 |
5 | 5 | 14.786221 |
6 | 6 | 15.400085 |
q3(df)
passenger_count | pickup_datetime | 0 | |
---|---|---|---|
0 | 0 | 2013-08-14 12:07:00 | 1 |
1 | 0 | 2013-08-14 12:37:00 | 1 |
2 | 0 | 2013-08-15 00:00:00 | 1 |
3 | 1 | 2013-08-01 08:14:37 | 1 |
4 | 1 | 2013-08-01 09:48:00 | 1 |
... | ... | ... | ... |
9909 | 6 | 2013-09-28 18:30:15 | 1 |
9910 | 6 | 2013-09-28 19:57:22 | 1 |
9911 | 6 | 2013-09-29 18:47:29 | 1 |
9912 | 6 | 2013-09-30 02:27:33 | 1 |
9913 | 6 | 2013-09-30 21:31:06 | 1 |
9914 rows x 3 columns
q4(df)
passenger_count | pickup_datetime | trip_distance | 0 | |
---|---|---|---|---|
2 | 1 | 2013 | 0 | 1991 |
3 | 1 | 2013 | 1 | 1270 |
4 | 1 | 2013 | 2 | 853 |
80 | 5 | 2013 | 0 | 551 |
81 | 5 | 2013 | 1 | 537 |
... | ... | ... | ... | ... |
77 | 4 | 2013 | 10 | 1 |
78 | 4 | 2013 | 11 | 1 |
79 | 4 | 2013 | 14 | 1 |
102 | 5 | 2013 | 28 | 1 |
115 | 6 | 2013 | 14 | 1 |
116 rows x 4 columns
%timeit q1(df)
%timeit q2(df)
%timeit q3(df)
%timeit q4(df)