#!/usr/bin/env python
# coding: utf-8

# In[1]:

# This notebook expects that Modin and Ray are installed, e.g. by `pip install modin[ray]`.
# For all ways to install Modin see the official documentation at:
# https://modin.readthedocs.io/en/latest/installation.html
# NOTE: this is a special version for showing cloud-cluster functionality.
# It requires installation of extra packages: `pip install cloudpickle rpyc`
# Also, if your environment requires a proxy for SSH, you need to expose it via the
# MODIN_SOCKS_PROXY environment variable; please note that it requires ray >= 0.8.7 to work.
import modin.experimental.pandas as pd
from modin.experimental.cloud import create_cluster

# In[2]:

# The CSV is read with header=None below, so every column name is supplied here, in file order.
columns_names = [
    "trip_id",
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "store_and_fwd_flag",
    "rate_code_id",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "ehail_fee",
    "improvement_surcharge",
    "total_amount",
    "payment_type",
    "trip_type",
    "pickup",
    "dropoff",
    "cab_type",
    "precipitation",
    "snow_depth",
    "snowfall",
    "max_temperature",
    "min_temperature",
    "average_wind_speed",
    "pickup_nyct2010_gid",
    "pickup_ctlabel",
    "pickup_borocode",
    "pickup_boroname",
    "pickup_ct2010",
    "pickup_boroct2010",
    "pickup_cdeligibil",
    "pickup_ntacode",
    "pickup_ntaname",
    "pickup_puma",
    "dropoff_nyct2010_gid",
    "dropoff_ctlabel",
    "dropoff_borocode",
    "dropoff_boroname",
    "dropoff_ct2010",
    "dropoff_boroct2010",
    "dropoff_cdeligibil",
    "dropoff_ntacode",
    "dropoff_ntaname",
    "dropoff_puma",
]
# Columns that read_csv should parse as datetimes (q4 relies on `.dt.year`).
parse_dates = ["pickup_datetime", "dropoff_datetime"]

# In[3]:

# `remote` is kept at module level because the later cells re-enter it with `with remote:`.
with create_cluster(
    "aws",
    "../../../aws_credentials",
    cluster_name="rayscale-test",
    region="eu-north-1",
    zone="eu-north-1b",
    image="ami-00e1e82d7d4ca80d3",
) as remote:
    df = pd.read_csv(
        "https://modin-datasets.s3.amazonaws.com/trips_data.csv",
        names=columns_names,
        header=None,
        parse_dates=parse_dates,
    )

# In[4]:

with remote:
    print(df)

# In[5]:


def q1(df):
    """Return the number of non-null ``cab_type`` values for each ``cab_type``."""
    return df.groupby("cab_type")["cab_type"].count()


def q2(df):
    """Return the mean ``total_amount`` for each ``passenger_count``.

    The result is a frame with exactly the columns ``passenger_count`` and
    ``total_amount`` (the group key is kept as a column, not as the index).
    """
    # Select the two needed columns *before* aggregating: averaging the whole
    # frame wastes work on ~50 unused columns and, on pandas >= 2.0 (where
    # `numeric_only` defaults to False), raises TypeError on string columns.
    needed = df[["passenger_count", "total_amount"]]
    return needed.groupby("passenger_count", as_index=False).mean()


def q3(df):
    """Return the row count for each (``passenger_count``, ``pickup_datetime``) pair.

    The counts end up in a column named ``0`` after ``reset_index()``.
    """
    return df.groupby(["passenger_count", "pickup_datetime"]).size().reset_index()


def q4(df):
    """Return trip counts per passenger count, pickup year and integer trip distance.

    The result is sorted by pickup year ascending and, within a year, by the
    count column (named ``0`` after ``reset_index()``) descending.
    """
    transformed = pd.DataFrame(
        {
            "passenger_count": df["passenger_count"],
            # Only the year of the pickup timestamp is used for grouping.
            "pickup_datetime": df["pickup_datetime"].dt.year,
            # NOTE(review): the int64 cast assumes trip_distance has no missing
            # values; confirm against the dataset.
            "trip_distance": df["trip_distance"].astype("int64"),
        }
    )
    counts = (
        transformed.groupby(["passenger_count", "pickup_datetime", "trip_distance"])
        .size()
        .reset_index()
    )
    return counts.sort_values(by=["pickup_datetime", 0], ascending=[True, False])


# In[6]:

with remote:
    for query in (q1, q2, q3, q4):
        print(query(df))

# In[ ]: