#!/usr/bin/env python
# coding: utf-8

# In[1]:


# This notebook expects that Modin and Ray are installed, e.g. by `pip install modin[ray]`.
# For all ways to install Modin see official documentation at:
# https://modin.readthedocs.io/en/latest/installation.html

# NOTE: this is a special version for showing cloud-cluster functionality.
# It requires installation of extra packages: `pip install cloudpickle rpyc`
# Also, if your environment requires a proxy for SSH, you need to expose it via the MODIN_SOCKS_PROXY
# environment variable; please note that this requires ray >= 0.8.7 to work.
import modin.experimental.pandas as pd
from modin.experimental.cloud import create_cluster


# In[2]:


# Column labels for the trips CSV; the file is read with header=None, so these
# names are supplied to read_csv explicitly. Grouping below follows name prefixes.
columns_names = [
    # trip, fare and payment fields
    "trip_id", "vendor_id",
    "pickup_datetime", "dropoff_datetime",
    "store_and_fwd_flag", "rate_code_id",
    "pickup_longitude", "pickup_latitude",
    "dropoff_longitude", "dropoff_latitude",
    "passenger_count", "trip_distance",
    "fare_amount", "extra", "mta_tax", "tip_amount", "tolls_amount",
    "ehail_fee", "improvement_surcharge", "total_amount",
    "payment_type", "trip_type",
    "pickup", "dropoff", "cab_type",
    # weather-named fields
    "precipitation", "snow_depth", "snowfall",
    "max_temperature", "min_temperature", "average_wind_speed",
    # pickup_* location fields
    "pickup_nyct2010_gid", "pickup_ctlabel", "pickup_borocode",
    "pickup_boroname", "pickup_ct2010", "pickup_boroct2010",
    "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname", "pickup_puma",
    # dropoff_* location fields
    "dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode",
    "dropoff_boroname", "dropoff_ct2010", "dropoff_boroct2010",
    "dropoff_cdeligibil", "dropoff_ntacode", "dropoff_ntaname", "dropoff_puma",
]

# Columns handed to read_csv's parse_dates argument.
parse_dates = ["pickup_datetime", "dropoff_datetime"]


# In[3]:


# Open the cluster context returned by create_cluster and read the trips CSV
# inside it. `remote` and `df` are reused by the cells that follow.
with create_cluster(
    "aws",
    "../../../aws_credentials",
    cluster_name="rayscale-test",
    region="eu-north-1",
    zone="eu-north-1b",
    image="ami-00e1e82d7d4ca80d3",
) as remote:
    df = pd.read_csv(
        "https://modin-datasets.s3.amazonaws.com/trips_data.csv",
        names=columns_names,
        header=None,
        parse_dates=parse_dates,
    )


# In[4]:


# Re-enter the `remote` context obtained from create_cluster and print the
# frame that was loaded from the trips CSV.
with remote:
    print(df)


# In[5]:


def q1(df):
    """Count the non-null ``cab_type`` entries within each ``cab_type`` group.

    Returns a Series indexed by ``cab_type``.
    """
    by_cab_type = df.groupby("cab_type")
    cab_type_per_group = by_cab_type["cab_type"]
    return cab_type_per_group.count()
def q2(df):
    """Return the mean ``total_amount`` for each ``passenger_count`` value.

    The result is a frame with exactly two columns, ``passenger_count`` and
    ``total_amount``, one row per distinct passenger count.

    Only ``total_amount`` is aggregated. Averaging every column and discarding
    all but one afterwards does needless work on a wide frame, and fails on
    non-numeric columns in pandas versions where ``numeric_only`` defaults to
    False.
    """
    # Select the value column before aggregating; as_index=False keeps the
    # grouping key as a regular column in the output.
    return df.groupby("passenger_count", as_index=False)[["total_amount"]].mean()
def q3(df):
    """Count rows for every (``passenger_count``, ``pickup_datetime``) pair.

    Returns a flat frame: the two grouping keys as columns plus the group
    size, which ``reset_index`` labels ``0``.
    """
    group_keys = ["passenger_count", "pickup_datetime"]
    rows_per_group = df.groupby(group_keys).size()
    return rows_per_group.reset_index()
def q4(df):
    """Count trips per (passenger_count, pickup year, int64 trip distance).

    Builds a three-column frame from ``df`` (``pickup_datetime`` is replaced by
    its ``.dt.year`` and ``trip_distance`` is cast to ``int64``), counts rows
    per key combination, and returns the counts ordered by year ascending and
    then by count descending.
    """
    group_keys = ["passenger_count", "pickup_datetime", "trip_distance"]

    passengers = df["passenger_count"]
    pickup_year = df["pickup_datetime"].dt.year
    distance_int = df["trip_distance"].astype("int64")
    transformed = pd.DataFrame({
        "passenger_count": passengers,
        "pickup_datetime": pickup_year,
        "trip_distance": distance_int,
    })

    counts = transformed.groupby(group_keys).size().reset_index()
    # reset_index labels the unnamed size column 0, hence the integer sort key.
    return counts.sort_values(by=["pickup_datetime", 0], ascending=[True, False])


# In[6]:


# Inside the `remote` context, evaluate q1..q4 against `df` one at a time and
# print each result as soon as it is available.
with remote:
    for query in [q1, q2, q3, q4]:
        result = query(df)
        print(result)


# In[ ]: