#!/usr/bin/env python
# coding: utf-8

# # Most Time Spent
# 
# Let's say we have data from [NYC City Bike Data](https://s3.amazonaws.com/tripdata/index.html)
# 
# We have a `DataFrame` with `start` and `end` time. We'd like to know for each ride where it spent most of the time - `morning`, `noon`, `evening` or `night`.
# 
# We're going to convert time of day to minutes since midnight, then we can look at `(start, end)` as line and find overlap between then.
# 
# ---
# Credits to Pietro Battiston who gave the basis for this [on stackoverflow](https://stackoverflow.com/questions/46314769/label-rows-by-most-time-spent-in-a-time-interval/46372005#46372005).

# In[1]:


import numpy as np
import pandas as pd


# ### Load Data

# In[2]:


start_col, end_col = 'starttime', 'stoptime'

# Loading just first 10,000 rows
df = pd.read_csv('201501-citibike-tripdata.csv', parse_dates=[start_col, end_col], nrows=10000)


# ### Parts of Day

# In[3]:


@np.vectorize
def minutes(time):
    """Convert time to minutes since 00:00

    >>> minutes('3:17')
    197
    """
    # Work on strings as well
    if isinstance(time, str):
        time = pd.Timestamp(time)
        
    return (time.hour * 60) + time.minute


# Parts of day, we can have overlaps here
parts_of_day = [
    ('morning', minutes('05:00am'),  minutes('11:00')),
    ('noon', minutes('11:00am'), minutes('14:00')),
    ('evening', minutes('14:00'), minutes('19:00')),
    # Night is two continuous parts
    ('night_a', minutes('18:30'), minutes('23:59')),
    ('night_b', minutes('00:00'), minutes('6:00')),
]

# Auxiliary DataFrame
parts_df = pd.DataFrame(index=df.index)
for col in [start_col, end_col]:
    parts_df[col] = minutes(df[col].dt.time)

for name, start, end in parts_of_day:
    parts_df['start_p'] = start
    parts_df['end_p'] = end
    ends = parts_df[[end_col, 'end_p']].min(axis=1)
    starts = parts_df[[start_col, 'start_p']].max(axis=1)
    duration = ends - starts
    parts_df.loc[duration >= 0, name] = duration[duration >= 0]

parts_df['night'] = parts_df['night_a'] + parts_df['night_b']
parts_df.fillna(0, inplace=True)

# Assign part of day where most time was spent
df['day_part'] = parts_df[['morning', 'noon', 'evening', 'night']].idxmax(axis=1)


# ### Sample Output

# In[4]:


df['day_part'].sample(20)


# In[ ]: